From f7c475b8dfc23d461a47dfac5e498f8cc96faea5 Mon Sep 17 00:00:00 2001 From: xinhui pan Date: Wed, 24 Feb 2021 11:28:08 +0800 Subject: [PATCH 001/263] drm/ttm: Do not add non-system domain BO into swap list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BO would be added into swap list if it is validated into system domain. If BO is validated again into non-system domain, say, VRAM domain. It actually should not be in the swap list. Acked-by: Alex Deucher Signed-off-by: xinhui pan Signed-off-by: Alex Deucher Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20210224032808.150465-1-xinhui.pan@amd.com Signed-off-by: Christian König --- drivers/gpu/drm/ttm/ttm_bo.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 101a68dc615b6..799ec7a7caa4d 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -153,6 +153,8 @@ void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo, swap = &ttm_bo_glob.swap_lru[bo->priority]; list_move_tail(&bo->swap, swap); + } else { + list_del_init(&bo->swap); } if (bdev->driver->del_from_lru_notify) From ffe8768fb8f391cb478466778c55e2110525c15c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 15 Apr 2021 16:45:25 +0800 Subject: [PATCH 002/263] drm/vc4: remove unused function Fix the following clang warning: drivers/gpu/drm/vc4/vc4_vec.c:201:1: warning: unused function 'to_vc4_vec_connector' [-Wunused-function]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/1618476325-112629-1-git-send-email-jiapeng.chong@linux.alibaba.com --- drivers/gpu/drm/vc4/vc4_vec.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_vec.c b/drivers/gpu/drm/vc4/vc4_vec.c index bd5b8eb58b180..090529d0d5dcd 100644 --- a/drivers/gpu/drm/vc4/vc4_vec.c +++ b/drivers/gpu/drm/vc4/vc4_vec.c @@ -197,12 +197,6 @@ struct vc4_vec_connector { struct drm_encoder *encoder; }; -static inline struct vc4_vec_connector * -to_vc4_vec_connector(struct drm_connector *connector) -{ - return container_of(connector, struct vc4_vec_connector, base); -} - enum vc4_vec_tv_mode_id { VC4_VEC_TV_MODE_NTSC, VC4_VEC_TV_MODE_NTSC_J, From 10f76165d30bf568214e75767f2d8d8682cd4040 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 26 Apr 2021 16:53:25 -0700 Subject: [PATCH 003/263] drm/msm: Do not unpin/evict exported dma-buf's Our initial logic for excluding dma-bufs was not quite right. In particular we want msm_gem_get/put_pages() path used for exported dma-bufs to increment/decrement the pin-count. Also, in case the importer is vmap'ing the dma-buf, we need to be sure to update the object's status, because it is now no longer potentially evictable. Fixes: 63f17ef83428 drm/msm: Support evicting GEM objects to swap Signed-off-by: Rob Clark Link: https://lore.kernel.org/r/20210426235326.1230125-1-robdclark@gmail.com Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c | 16 +++++++++++++++- drivers/gpu/drm/msm/msm_gem.h | 4 ++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index b199942266a26..56df86e5f7400 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -190,13 +190,25 @@ struct page **msm_gem_get_pages(struct drm_gem_object *obj) } p = get_pages(obj); + + if (!IS_ERR(p)) { + msm_obj->pin_count++; + update_inactive(msm_obj); + } + msm_gem_unlock(obj); return p; } void msm_gem_put_pages(struct drm_gem_object *obj) { - /* when we start tracking the pin count, then do something here */ + struct msm_gem_object *msm_obj = to_msm_bo(obj); + + msm_gem_lock(obj); + msm_obj->pin_count--; + GEM_WARN_ON(msm_obj->pin_count < 0); + update_inactive(msm_obj); + msm_gem_unlock(obj); } int msm_gem_mmap_obj(struct drm_gem_object *obj, @@ -646,6 +658,8 @@ static void *get_vaddr(struct drm_gem_object *obj, unsigned madv) ret = -ENOMEM; goto fail; } + + update_inactive(msm_obj); } return msm_obj->vaddr; diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index a6480d2c81b2c..03e2cc2a2ce15 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -221,7 +221,7 @@ static inline bool is_active(struct msm_gem_object *msm_obj) /* imported/exported objects are not purgeable: */ static inline bool is_unpurgeable(struct msm_gem_object *msm_obj) { - return msm_obj->base.dma_buf && msm_obj->base.import_attach; + return msm_obj->base.import_attach || msm_obj->pin_count; } static inline bool is_purgeable(struct msm_gem_object *msm_obj) @@ -271,7 +271,7 @@ static inline void mark_unpurgeable(struct msm_gem_object *msm_obj) static inline bool is_unevictable(struct msm_gem_object *msm_obj) { - return is_unpurgeable(msm_obj) || msm_obj->pin_count || msm_obj->vaddr; + return is_unpurgeable(msm_obj) || msm_obj->vaddr; } static inline void mark_evictable(struct msm_gem_object *msm_obj) From 4b95d371fb001185af84d177e69a23d55bd0167a Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Fri, 23 Apr 2021 21:49:26 -0400 Subject: [PATCH 004/263] drm/msm: fix LLC not being enabled for mmu500 targets mmu500 targets don't have a "cx_mem" region, set llc_mmio to NULL in that case to avoid the IS_ERR() condition in a6xx_llc_activate(). Fixes: 3d247123b5a1 ("drm/msm/a6xx: Add support for using system cache on MMU500 based targets") Signed-off-by: Jonathan Marek Link: https://lore.kernel.org/r/20210424014927.1661-1-jonathan@marek.ca Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index d553f62f4eeb8..b4d8e1b01ee4f 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1153,10 +1153,6 @@ static void a6xx_llc_slices_init(struct platform_device *pdev, { struct device_node *phandle; - a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); - if (IS_ERR(a6xx_gpu->llc_mmio)) - return; - /* * There is a different programming path for targets with an mmu500 * attached, so detect if that is the case @@ -1166,6 +1162,11 @@ static void a6xx_llc_slices_init(struct platform_device *pdev, of_device_is_compatible(phandle, "arm,mmu-500")); of_node_put(phandle); + if (a6xx_gpu->have_mmu500) + a6xx_gpu->llc_mmio = NULL; + else + a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); + a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); From 08811c057b3e22f7a3df3955c138a59f3b651df0 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 10 Apr 2021 04:19:01 +0300 Subject: [PATCH 005/263] drm/msm/dsi: dsi_phy_28nm_8960: fix uninitialized variable access The parent_name initialization was lost in refactoring, restore it now. Fixes: 5d13459650b3 ("drm/msm/dsi: push provided clocks handling into a generic code") Reported-by: kernel test robot Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Link: https://lore.kernel.org/r/20210410011901.1735866-1-dmitry.baryshkov@linaro.org Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c index 582b1428f9715..86e40a0d41a3b 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c @@ -405,6 +405,10 @@ static int pll_28nm_register(struct dsi_pll_28nm *pll_28nm, struct clk_hw **prov if (!vco_name) return -ENOMEM; + parent_name = devm_kzalloc(dev, 32, GFP_KERNEL); + if (!parent_name) + return -ENOMEM; + clk_name = devm_kzalloc(dev, 32, GFP_KERNEL); if (!clk_name) return -ENOMEM; From 094c7f39ba4b5ae7e4c448527834428b79e3baf9 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 12 Apr 2021 03:01:58 +0300 Subject: [PATCH 006/263] drm/msm/dsi: fix msm_dsi_phy_get_clk_provider return code msm_dsi_phy_get_clk_provider() always returns two provided clocks, so return 0 instead of returning incorrect -EINVAL error code. Fixes: 5d13459650b3 ("drm/msm/dsi: push provided clocks handling into a generic code") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Tested-by: Jonathan Marek Link: https://lore.kernel.org/r/20210412000158.2049066-1-dmitry.baryshkov@linaro.org Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/phy/dsi_phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c index f0a2ddf96a4b9..ff7f2ec420300 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c @@ -843,7 +843,7 @@ int msm_dsi_phy_get_clk_provider(struct msm_dsi_phy *phy, if (pixel_clk_provider) *pixel_clk_provider = phy->provided_clocks->hws[DSI_PIXEL_PLL_CLK]->clk; - return -EINVAL; + return 0; } void msm_dsi_phy_pll_save_state(struct msm_dsi_phy *phy) From adbd914dcde0b03bfc08ffe40b81f31b0457833f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 21 Apr 2021 14:31:50 +0100 Subject: [PATCH 007/263] btrfs: zoned: fix silent data loss after failure splitting ordered extent On a zoned filesystem, sometimes we need to split an ordered extent into 3 different ordered extents. The original ordered extent is shortened, at the front and at the rear, and we create two other new ordered extents to represent the trimmed parts of the original ordered extent. After adjusting the original ordered extent, we create an ordered extent to represent the pre-range, and that may fail with ENOMEM for example. After that we always try to create the ordered extent for the post-range, and if that happens to succeed we end up returning success to the caller as we overwrite the 'ret' variable which contained the previous error. This means we end up with a file range for which there is no ordered extent, which results in the range never getting a new file extent item pointing to the new data location. And since the split operation did not return an error, writeback does not fail and the inode's mapping is not flagged with an error, resulting in a subsequent fsync not reporting an error either. It's possibly very unlikely to have the creation of the post-range ordered extent succeed after the creation of the pre-range ordered extent failed, but it's not impossible. So fix this by making sure we only create the post-range ordered extent if there was no error creating the ordered extent for the pre-range. Fixes: d22002fd37bd97 ("btrfs: zoned: split ordered extent when bio is sent") CC: stable@vger.kernel.org # 5.12+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 07b0b42187913..6c413bb451a3d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -984,7 +984,7 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, if (pre) ret = clone_ordered_extent(ordered, 0, pre); - if (post) + if (ret == 0 && post) ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, post); From ffb7c2e923cb3232454a513dcb5636e73091aa88 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 22 Apr 2021 12:09:21 +0100 Subject: [PATCH 008/263] btrfs: do not consider send context as valid when trying to flush qgroups At qgroup.c:try_flush_qgroup() we are asserting that current->journal_info is either NULL or has the value BTRFS_SEND_TRANS_STUB. However allowing for BTRFS_SEND_TRANS_STUB makes no sense because: 1) It is misleading, because send operations are read-only and do not ever need to reserve qgroup space; 2) We already assert that current->journal_info != BTRFS_SEND_TRANS_STUB at transaction.c:start_transaction(); 3) On a kernel without CONFIG_BTRFS_ASSERT=y set, it would result in a crash if try_flush_qgroup() is ever called in a send context, because at transaction.c:start_transaction we cast current->journal_info into a struct btrfs_trans_handle pointer and then dereference it. So just do allow a send context at try_flush_qgroup() and update the comment about it. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2319c923c9e69..b1caf5acf1e2c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3545,11 +3545,15 @@ static int try_flush_qgroup(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - /* Can't hold an open transaction or we run the risk of deadlocking */ - ASSERT(current->journal_info == NULL || - current->journal_info == BTRFS_SEND_TRANS_STUB); - if (WARN_ON(current->journal_info && - current->journal_info != BTRFS_SEND_TRANS_STUB)) + /* + * Can't hold an open transaction or we run the risk of deadlocking, + * and can't either be under the context of a send operation (where + * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that + * would result in a crash when starting a transaction and does not + * make sense either (send is a read-only operation). + */ + ASSERT(current->journal_info == NULL); + if (WARN_ON(current->journal_info)) return 0; /* From 626e9f41f7c281ba3e02843702f68471706aa6d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 27 Apr 2021 11:27:20 +0100 Subject: [PATCH 009/263] btrfs: fix race leading to unpersisted data and metadata on fsync When doing a fast fsync on a file, there is a race which can result in the fsync returning success to user space without logging the inode and without durably persisting new data. The following example shows one possible scenario for this: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ touch /mnt/bar $ xfs_io -f -c "pwrite -S 0xab 0 1M" -c "fsync" /mnt/baz # Now we have: # file bar == inode 257 # file baz == inode 258 $ mv /mnt/baz /mnt/foo # Now we have: # file bar == inode 257 # file foo == inode 258 $ xfs_io -c "pwrite -S 0xcd 0 1M" /mnt/foo # fsync bar before foo, it is important to trigger the race. $ xfs_io -c "fsync" /mnt/bar $ xfs_io -c "fsync" /mnt/foo # After this: # inode 257, file bar, is empty # inode 258, file foo, has 1M filled with 0xcd # Replay the log: $ mount /dev/sdc /mnt # After this point file foo should have 1M filled with 0xcd and not 0xab The following steps explain how the race happens: 1) Before the first fsync of inode 258, when it has the "baz" name, its ->logged_trans is 0, ->last_sub_trans is 0 and ->last_log_commit is -1. The inode also has the full sync flag set; 2) After the first fsync, we set inode 258 ->logged_trans to 6, which is the generation of the current transaction, and set ->last_log_commit to 0, which is the current value of ->last_sub_trans (done at btrfs_log_inode()). The full sync flag is cleared from the inode during the fsync. The log sub transaction that was committed had an ID of 0 and when we synced the log, at btrfs_sync_log(), we incremented root->log_transid from 0 to 1; 3) During the rename: We update inode 258, through btrfs_update_inode(), and that causes its ->last_sub_trans to be set to 1 (the current log transaction ID), and ->last_log_commit remains with a value of 0. After updating inode 258, because we have previously logged the inode in the previous fsync, we log again the inode through the call to btrfs_log_new_name(). This results in updating the inode's ->last_log_commit from 0 to 1 (the current value of its ->last_sub_trans). The ->last_sub_trans of inode 257 is updated to 1, which is the ID of the next log transaction; 4) Then a buffered write against inode 258 is made. This leaves the value of ->last_sub_trans as 1 (the ID of the current log transaction, stored at root->log_transid); 5) Then an fsync against inode 257 (or any other inode other than 258), happens. This results in committing the log transaction with ID 1, which results in updating root->last_log_commit to 1 and bumping root->log_transid from 1 to 2; 6) Then an fsync against inode 258 starts. We flush delalloc and wait only for writeback to complete, since the full sync flag is not set in the inode's runtime flags - we do not wait for ordered extents to complete. Then, at btrfs_sync_file(), we call btrfs_inode_in_log() before the ordered extent completes. The call returns true: static inline bool btrfs_inode_in_log(...) { bool ret = false; spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && inode->last_sub_trans <= inode->root->last_log_commit) ret = true; spin_unlock(&inode->lock); return ret; } generation has a value of 6 (fs_info->generation), ->logged_trans also has a value of 6 (set when we logged the inode during the first fsync and when logging it during the rename), ->last_sub_trans has a value of 1, set during the rename (step 3), ->last_log_commit also has a value of 1 (set in step 3) and root->last_log_commit has a value of 1, which was set in step 5 when fsyncing inode 257. As a consequence we don't log the inode, any new extents and do not sync the log, resulting in a data loss if a power failure happens after the fsync and before the current transaction commits. Also, because we do not log the inode, after a power failure the mtime and ctime of the inode do not match those we had before. When the ordered extent completes before we call btrfs_inode_in_log(), then the call returns false and we log the inode and sync the log, since at the end of ordered extent completion we update the inode and set ->last_sub_trans to 2 (the value of root->log_transid) and ->last_log_commit to 1. This problem is found after removing the check for the emptiness of the inode's list of modified extents in the recent commit 209ecbb8585bf6 ("btrfs: remove stale comment and logic from btrfs_inode_in_log()"), added in the 5.13 merge window. However checking the emptiness of the list is not really the way to solve this problem, and was never intended to, because while that solves the problem for COW writes, the problem persists for NOCOW writes because in that case the list is always empty. In the case of NOCOW writes, even though we wait for the writeback to complete before returning from btrfs_sync_file(), we end up not logging the inode, which has a new mtime/ctime, and because we don't sync the log, we never issue disk barriers (send REQ_PREFLUSH to the device) since that only happens when we sync the log (when we write super blocks at btrfs_sync_log()). So effectively, for a NOCOW case, when we return from btrfs_sync_file() to user space, we are not guaranteeing that the data is durably persisted on disk. Also, while the example above uses a rename exchange to show how the problem happens, it is not the only way to trigger it. An alternative could be adding a new hard link to inode 258, since that also results in calling btrfs_log_new_name() and updating the inode in the log. An example reproducer using the addition of a hard link instead of a rename operation: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ touch /mnt/bar $ xfs_io -f -c "pwrite -S 0xab 0 1M" -c "fsync" /mnt/foo $ ln /mnt/foo /mnt/foo_link $ xfs_io -c "pwrite -S 0xcd 0 1M" /mnt/foo $ xfs_io -c "fsync" /mnt/bar $ xfs_io -c "fsync" /mnt/foo # Replay the log: $ mount /dev/sdc /mnt # After this point file foo often has 1M filled with 0xab and not 0xcd The reasons leading to the final fsync of file foo, inode 258, not persisting the new data are the same as for the previous example with a rename operation. So fix by never skipping logging and log syncing when there are still any ordered extents in flight. To avoid making the conditional if statement that checks if logging an inode is needed harder to read, place all the logic into an helper function with separate if statements to make it more manageable and easier to read. A test case for fstests will follow soon. For NOCOW writes, the problem existed before commit b5e6c3e170b770 ("btrfs: always wait on ordered extents at fsync time"), introduced in kernel 4.19, then it went away with that commit since we started to always wait for ordered extent completion before logging. The problem came back again once the fast fsync path was changed again to avoid waiting for ordered extent completion, in commit 487781796d3022 ("btrfs: make fast fsyncs wait only for writeback"), added in kernel 5.10. However, for COW writes, the race only happens after the recent commit 209ecbb8585bf6 ("btrfs: remove stale comment and logic from btrfs_inode_in_log()"), introduced in the 5.13 merge window. For NOCOW writes, the bug existed before that commit. So tag 5.10+ as the release for stable backports. CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 35 +++++++++++++++++++++++++---------- fs/btrfs/tree-log.c | 3 ++- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 864c08d08a353..3b10d98b4ebb3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2067,6 +2067,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) return ret; } +static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_inode_in_log(inode, fs_info->generation) && + list_empty(&ctx->ordered_extents)) + return true; + + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ + if (inode->last_trans <= fs_info->last_trans_committed && + (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || + list_empty(&ctx->ordered_extents))) + return true; + + return false; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -2185,17 +2209,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - /* - * If we are doing a fast fsync we can not bail out if the inode's - * last_trans is <= then the last committed transaction, because we only - * update the last_trans of the inode during ordered extent completion, - * and for a fast fsync we don't wait for that, we only wait for the - * writeback to complete. - */ smp_mb(); - if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && - (full_sync || list_empty(&ctx.ordered_extents)))) { + if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c1353b84ae54c..a0fc3a1390ab3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6060,7 +6060,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * (since logging them is pointless, a link count of 0 means they * will never be accessible). */ - if (btrfs_inode_in_log(inode, trans->transid) || + if ((btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) || inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; From f9baa501b4fd6962257853d46ddffbc21f27e344 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 22 Apr 2021 12:08:05 +0100 Subject: [PATCH 010/263] btrfs: fix deadlock when cloning inline extents and using qgroups There are a few exceptional cases where cloning an inline extent needs to copy the inline extent data into a page of the destination inode. When this happens, we end up starting a transaction while having a dirty page for the destination inode and while having the range locked in the destination's inode iotree too. Because when reserving metadata space for a transaction we may need to flush existing delalloc in case there is not enough free space, we have a mechanism in place to prevent a deadlock, which was introduced in commit 3d45f221ce627d ("btrfs: fix deadlock when cloning inline extent and low on free metadata space"). However when using qgroups, a transaction also reserves metadata qgroup space, which can also result in flushing delalloc in case there is not enough available space at the moment. When this happens we deadlock, since flushing delalloc requires locking the file range in the inode's iotree and the range was already locked at the very beginning of the clone operation, before attempting to start the transaction. When this issue happens, stack traces like the following are reported: [72747.556262] task:kworker/u81:9 state:D stack: 0 pid: 225 ppid: 2 flags:0x00004000 [72747.556268] Workqueue: writeback wb_workfn (flush-btrfs-1142) [72747.556271] Call Trace: [72747.556273] __schedule+0x296/0x760 [72747.556277] schedule+0x3c/0xa0 [72747.556279] io_schedule+0x12/0x40 [72747.556284] __lock_page+0x13c/0x280 [72747.556287] ? generic_file_readonly_mmap+0x70/0x70 [72747.556325] extent_write_cache_pages+0x22a/0x440 [btrfs] [72747.556331] ? __set_page_dirty_nobuffers+0xe7/0x160 [72747.556358] ? set_extent_buffer_dirty+0x5e/0x80 [btrfs] [72747.556362] ? update_group_capacity+0x25/0x210 [72747.556366] ? cpumask_next_and+0x1a/0x20 [72747.556391] extent_writepages+0x44/0xa0 [btrfs] [72747.556394] do_writepages+0x41/0xd0 [72747.556398] __writeback_single_inode+0x39/0x2a0 [72747.556403] writeback_sb_inodes+0x1ea/0x440 [72747.556407] __writeback_inodes_wb+0x5f/0xc0 [72747.556410] wb_writeback+0x235/0x2b0 [72747.556414] ? get_nr_inodes+0x35/0x50 [72747.556417] wb_workfn+0x354/0x490 [72747.556420] ? newidle_balance+0x2c5/0x3e0 [72747.556424] process_one_work+0x1aa/0x340 [72747.556426] worker_thread+0x30/0x390 [72747.556429] ? create_worker+0x1a0/0x1a0 [72747.556432] kthread+0x116/0x130 [72747.556435] ? kthread_park+0x80/0x80 [72747.556438] ret_from_fork+0x1f/0x30 [72747.566958] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs] [72747.566961] Call Trace: [72747.566964] __schedule+0x296/0x760 [72747.566968] ? finish_wait+0x80/0x80 [72747.566970] schedule+0x3c/0xa0 [72747.566995] wait_extent_bit.constprop.68+0x13b/0x1c0 [btrfs] [72747.566999] ? finish_wait+0x80/0x80 [72747.567024] lock_extent_bits+0x37/0x90 [btrfs] [72747.567047] btrfs_invalidatepage+0x299/0x2c0 [btrfs] [72747.567051] ? find_get_pages_range_tag+0x2cd/0x380 [72747.567076] __extent_writepage+0x203/0x320 [btrfs] [72747.567102] extent_write_cache_pages+0x2bb/0x440 [btrfs] [72747.567106] ? update_load_avg+0x7e/0x5f0 [72747.567109] ? enqueue_entity+0xf4/0x6f0 [72747.567134] extent_writepages+0x44/0xa0 [btrfs] [72747.567137] ? enqueue_task_fair+0x93/0x6f0 [72747.567140] do_writepages+0x41/0xd0 [72747.567144] __filemap_fdatawrite_range+0xc7/0x100 [72747.567167] btrfs_run_delalloc_work+0x17/0x40 [btrfs] [72747.567195] btrfs_work_helper+0xc2/0x300 [btrfs] [72747.567200] process_one_work+0x1aa/0x340 [72747.567202] worker_thread+0x30/0x390 [72747.567205] ? create_worker+0x1a0/0x1a0 [72747.567208] kthread+0x116/0x130 [72747.567211] ? kthread_park+0x80/0x80 [72747.567214] ret_from_fork+0x1f/0x30 [72747.569686] task:fsstress state:D stack: 0 pid:841421 ppid:841417 flags:0x00000000 [72747.569689] Call Trace: [72747.569691] __schedule+0x296/0x760 [72747.569694] schedule+0x3c/0xa0 [72747.569721] try_flush_qgroup+0x95/0x140 [btrfs] [72747.569725] ? finish_wait+0x80/0x80 [72747.569753] btrfs_qgroup_reserve_data+0x34/0x50 [btrfs] [72747.569781] btrfs_check_data_free_space+0x5f/0xa0 [btrfs] [72747.569804] btrfs_buffered_write+0x1f7/0x7f0 [btrfs] [72747.569810] ? path_lookupat.isra.48+0x97/0x140 [72747.569833] btrfs_file_write_iter+0x81/0x410 [btrfs] [72747.569836] ? __kmalloc+0x16a/0x2c0 [72747.569839] do_iter_readv_writev+0x160/0x1c0 [72747.569843] do_iter_write+0x80/0x1b0 [72747.569847] vfs_writev+0x84/0x140 [72747.569869] ? btrfs_file_llseek+0x38/0x270 [btrfs] [72747.569873] do_writev+0x65/0x100 [72747.569876] do_syscall_64+0x33/0x40 [72747.569879] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [72747.569899] task:fsstress state:D stack: 0 pid:841424 ppid:841417 flags:0x00004000 [72747.569903] Call Trace: [72747.569906] __schedule+0x296/0x760 [72747.569909] schedule+0x3c/0xa0 [72747.569936] try_flush_qgroup+0x95/0x140 [btrfs] [72747.569940] ? finish_wait+0x80/0x80 [72747.569967] __btrfs_qgroup_reserve_meta+0x36/0x50 [btrfs] [72747.569989] start_transaction+0x279/0x580 [btrfs] [72747.570014] clone_copy_inline_extent+0x332/0x490 [btrfs] [72747.570041] btrfs_clone+0x5b7/0x7a0 [btrfs] [72747.570068] ? lock_extent_bits+0x64/0x90 [btrfs] [72747.570095] btrfs_clone_files+0xfc/0x150 [btrfs] [72747.570122] btrfs_remap_file_range+0x3d8/0x4a0 [btrfs] [72747.570126] do_clone_file_range+0xed/0x200 [72747.570131] vfs_clone_file_range+0x37/0x110 [72747.570134] ioctl_file_clone+0x7d/0xb0 [72747.570137] do_vfs_ioctl+0x138/0x630 [72747.570140] __x64_sys_ioctl+0x62/0xc0 [72747.570143] do_syscall_64+0x33/0x40 [72747.570146] entry_SYSCALL_64_after_hwframe+0x44/0xa9 So fix this by skipping the flush of delalloc for an inode that is flagged with BTRFS_INODE_NO_DELALLOC_FLUSH, meaning it is currently under such a special case of cloning an inline extent, when flushing delalloc during qgroup metadata reservation. The special cases for cloning inline extents were added in kernel 5.7 by by commit 05a5a7621ce66c ("Btrfs: implement full reflink support for inline extents"), while having qgroup metadata space reservation flushing delalloc when low on space was added in kernel 5.9 by commit c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT"). So use a "Fixes:" tag for the later commit to ease stable kernel backports. Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20210421083137.31E3.409509F4@e16-tech.com/ Fixes: c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT") CC: stable@vger.kernel.org # 5.9+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/ioctl.c | 2 +- fs/btrfs/qgroup.c | 2 +- fs/btrfs/send.c | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 278e0cbc9a98b..0f5b0b12762bb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3127,7 +3127,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_snapshot(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1a349759efae4..69fcdf8f0b1c2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9691,7 +9691,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, return ret; } -int btrfs_start_delalloc_snapshot(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -9704,7 +9704,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - return start_delalloc_inodes(root, &wbc, true, false); + return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b1328f17607ea..0ba0e4ddaf6b4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1051,7 +1051,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, */ btrfs_drew_read_lock(&root->snapshot_lock); - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) goto out; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b1caf5acf1e2c..3ded812f522cc 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3566,7 +3566,7 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 55741adf90712..bd69db72acc5e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7170,7 +7170,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) int i; if (root) { - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); @@ -7178,7 +7178,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) for (i = 0; i < sctx->clone_roots_cnt; i++) { root = sctx->clone_roots[i].root; - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); From 02ded1314a465a89267be38231d9858206853d80 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Fri, 23 Apr 2021 15:04:20 -0400 Subject: [PATCH 011/263] drm/msm: fix minor version to indicate MSM_PARAM_SUSPENDS support Increase the minor version to indicate that MSM_PARAM_SUSPENDS is supported. Fixes: 3ab1c5cc3939 ("drm/msm: Add param for userspace to query suspend count") Signed-off-by: Jonathan Marek Link: https://lore.kernel.org/r/20210423190420.25217-1-jonathan@marek.ca Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index e1104d2454e2e..fe7d17cd35ecd 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -42,7 +42,7 @@ * - 1.7.0 - Add MSM_PARAM_SUSPENDS to access suspend count */ #define MSM_VERSION_MAJOR 1 -#define MSM_VERSION_MINOR 6 +#define MSM_VERSION_MINOR 7 #define MSM_VERSION_PATCHLEVEL 0 static const struct drm_mode_config_funcs mode_config_funcs = { From ff76d506030daeeeb967be8b8a189bf7aee8e7a8 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 29 Apr 2021 16:12:26 +1200 Subject: [PATCH 012/263] KVM: x86/mmu: Avoid unnecessary page table allocation in kvm_tdp_mmu_map() In kvm_tdp_mmu_map(), while iterating TDP MMU page table entries, it is possible SPTE has already been frozen by another thread but the frozen is not done yet, for instance, when another thread is still in middle of zapping large page. In this case, the !is_shadow_present_pte() check for old SPTE in tdp_mmu_for_each_pte() may hit true, and in this case allocating new page table is unnecessary since tdp_mmu_set_spte_atomic() later will return false and page table will need to be freed. Add is_removed_spte() check before allocating new page table to avoid this. Signed-off-by: Kai Huang Message-Id: <20210429041226.50279-1-kai.huang@intel.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 88f69a6cc4922..3c8284841bed4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1009,6 +1009,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } if (!is_shadow_present_pte(iter.old_spte)) { + /* + * If SPTE has been forzen by another thread, just + * give up and retry, avoiding unnecessary page table + * allocation and free. + */ + if (is_removed_spte(iter.old_spte)) + break; + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); child_pt = sp->spt; From 1699f65c8b658d434fe92563c906cd1a136c9cb6 Mon Sep 17 00:00:00 2001 From: "Shahin, Md Shahadat Hossain" Date: Fri, 30 Apr 2021 11:52:31 +0000 Subject: [PATCH 013/263] kvm/x86: Fix 'lpages' kvm stat for TDM MMU Large pages not being created properly may result in increased memory access time. The 'lpages' kvm stat used to keep track of the current number of large pages in the system, but with TDP MMU enabled the stat is not showing the correct number. This patch extends the lpages counter to cover the TDP case. Signed-off-by: Md Shahadat Hossain Shahin Cc: Bartosz Szczepanek Message-Id: <1619783551459.35424@amazon.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 3c8284841bed4..c743894fe0b7c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -444,6 +444,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + if (is_large_pte(old_spte) != is_large_pte(new_spte)) { + if (is_large_pte(old_spte)) + atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); + else + atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); + } + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ From d981dd15498b188636ec5a7d8ad485e650f63d8d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 28 Apr 2021 19:08:02 +0800 Subject: [PATCH 014/263] KVM: LAPIC: Accurately guarantee busy wait for timer to expire when using hv_timer Commit ee66e453db13d (KVM: lapic: Busy wait for timer to expire when using hv_timer) tries to set ktime->expired_tscdeadline by checking ktime->hv_timer_in_use since lapic timer oneshot/periodic modes which are emulated by vmx preemption timer also get advanced, they leverage the same vmx preemption timer logic with tsc-deadline mode. However, ktime->hv_timer_in_use is cleared before apic_timer_expired() handling, let's delay this clearing in preemption-disabled region. Fixes: ee66e453db13d ("KVM: lapic: Busy wait for timer to expire when using hv_timer") Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Message-Id: <1619608082-4187-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 152591f9243ab..c0ebef560bd14 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1913,8 +1913,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) goto out; WARN_ON(rcuwait_active(&vcpu->wait)); - cancel_hv_timer(apic); apic_timer_expired(apic, false); + cancel_hv_timer(apic); if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); From 262de4102c7bb8e59f26a967a8ffe8cce85cc537 Mon Sep 17 00:00:00 2001 From: Benjamin Segall Date: Thu, 29 Apr 2021 16:22:34 +0000 Subject: [PATCH 015/263] kvm: exit halt polling on need_resched() as well single_task_running() is usually more general than need_resched() but CFS_BANDWIDTH throttling will use resched_task() when there is just one task to get the task to block. This was causing long-need_resched warnings and was likely allowing VMs to overrun their quota when halt polling. Signed-off-by: Ben Segall Signed-off-by: Venkatesh Srinivas Message-Id: <20210429162233.116849-1-venkateshs@chromium.org> Signed-off-by: Paolo Bonzini Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2799c6660ccea..b9f12da6af0ea 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2973,7 +2973,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) goto out; } poll_end = cur = ktime_get(); - } while (single_task_running() && ktime_before(cur, stop)); + } while (single_task_running() && !need_resched() && + ktime_before(cur, stop)); } prepare_to_rcuwait(&vcpu->wait); From deee59bacb2402c20e6b1b6800f9a5127367eb2a Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 3 May 2021 15:54:42 +0300 Subject: [PATCH 016/263] KVM: nSVM: fix a typo in svm_leave_nested When forcibly leaving the nested mode, we should switch to vmcb01 Fixes: 4995a3685f1b ("KVM: SVM: Use a separate vmcb for the nested L2 guest") Signed-off-by: Maxim Levitsky Message-Id: <20210503125446.1353307-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 540d43ba2cf46..3321220f3deb7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -886,7 +886,7 @@ void svm_leave_nested(struct vcpu_svm *svm) svm->nested.nested_run_pending = 0; leave_guest_mode(vcpu); - svm_switch_vmcb(svm, &svm->nested.vmcb02); + svm_switch_vmcb(svm, &svm->vmcb01); nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); From c74ad08f3333db2e44d3346b863f6d10d35e37dd Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 3 May 2021 15:54:43 +0300 Subject: [PATCH 017/263] KVM: nSVM: fix few bugs in the vmcb02 caching logic * Define and use an invalid GPA (all ones) for init value of last and current nested vmcb physical addresses. * Reset the current vmcb12 gpa to the invalid value when leaving the nested mode, similar to what is done on nested vmexit. * Reset the last seen vmcb12 address when disabling the nested SVM, as it relies on vmcb02 fields which are freed at that point. Fixes: 4995a3685f1b ("KVM: SVM: Use a separate vmcb for the nested L2 guest") Signed-off-by: Maxim Levitsky Message-Id: <20210503125446.1353307-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/nested.c | 11 +++++++++++ arch/x86/kvm/svm/svm.c | 4 ++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cbbcee0a84f92..848956bb3cf1d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -113,6 +113,7 @@ #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define UNMAPPED_GVA (~(gpa_t)0) +#define INVALID_GPA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3321220f3deb7..a88c64e004c3d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -872,6 +872,15 @@ void svm_free_nested(struct vcpu_svm *svm) __free_page(virt_to_page(svm->nested.vmcb02.ptr)); svm->nested.vmcb02.ptr = NULL; + /* + * When last_vmcb12_gpa matches the current vmcb12 gpa, + * some vmcb12 fields are not loaded if they are marked clean + * in the vmcb12, since in this case they are up to date already. + * + * When the vmcb02 is freed, this optimization becomes invalid. + */ + svm->nested.last_vmcb12_gpa = INVALID_GPA; + svm->nested.initialized = false; } @@ -884,6 +893,8 @@ void svm_leave_nested(struct vcpu_svm *svm) if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; + svm->nested.vmcb12_gpa = INVALID_GPA; + leave_guest_mode(vcpu); svm_switch_vmcb(svm, &svm->vmcb01); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9790c73f2a325..be5cf612ab1fe 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1235,8 +1235,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->current_vmcb->asid_generation = 0; svm->asid = 0; - svm->nested.vmcb12_gpa = 0; - svm->nested.last_vmcb12_gpa = 0; + svm->nested.vmcb12_gpa = INVALID_GPA; + svm->nested.last_vmcb12_gpa = INVALID_GPA; vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { From 9d290e16432cacd448475d38dec2753b75b9665f Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 3 May 2021 15:54:44 +0300 Subject: [PATCH 018/263] KVM: nSVM: leave the guest mode prior to loading a nested state This allows the KVM to load the nested state more than once without warnings. Signed-off-by: Maxim Levitsky Message-Id: <20210503125446.1353307-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a88c64e004c3d..32400cba608d4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1309,12 +1309,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * L2 registers if needed are moved from the current VMCB to VMCB02. */ + if (is_guest_mode(vcpu)) + svm_leave_nested(svm); + else + svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; + svm->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; - if (svm->current_vmcb == &svm->vmcb01) - svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; svm->vmcb01.ptr->save.es = save->es; svm->vmcb01.ptr->save.cs = save->cs; From 7f6231a39117c2781beead59d6ae4923c2703147 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 3 May 2021 16:24:46 +1200 Subject: [PATCH 019/263] KVM: x86/mmu: Fix kdoc of __handle_changed_spte The function name of kdoc of __handle_changed_spte() should be itself, rather than handle_changed_spte(). Fix the typo. Signed-off-by: Kai Huang Message-Id: <20210503042446.154695-1-kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c743894fe0b7c..95eeb5ac6a8a7 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -388,7 +388,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, } /** - * handle_changed_spte - handle bookkeeping associated with an SPTE change + * __handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance * @as_id: the address space of the paging structure the SPTE was a part of * @gfn: the base GFN that was mapped by the SPTE From 8899a5fc7da516460f841189a28aac0b52b554fd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 30 Apr 2021 18:03:03 +0100 Subject: [PATCH 020/263] KVM: x86: Fix potential fput on a null source_kvm_file The fget can potentially return null, so the fput on the error return path can cause a null pointer dereference. Fix this by checking for a null source_kvm_file before doing a fput. Addresses-Coverity: ("Dereference null return") Fixes: 54526d1fd593 ("KVM: x86: Support KVM VMs sharing SEV context") Signed-off-by: Colin Ian King Message-Id: <20210430170303.131924-1-colin.king@canonical.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 1356ee095cd55..8b11c711a0e40 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1764,7 +1764,8 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) e_source_unlock: mutex_unlock(&source_kvm->lock); e_source_put: - fput(source_kvm_file); + if (source_kvm_file) + fput(source_kvm_file); return ret; } From 5e753a817b2d5991dfe8a801b7b1e8e79a1c5a20 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 30 Apr 2021 19:59:51 +0800 Subject: [PATCH 021/263] btrfs: fix unmountable seed device after fstrim The following test case reproduces an issue of wrongly freeing in-use blocks on the readonly seed device when fstrim is called on the rw sprout device. As shown below. Create a seed device and add a sprout device to it: $ mkfs.btrfs -fq -dsingle -msingle /dev/loop0 $ btrfstune -S 1 /dev/loop0 $ mount /dev/loop0 /btrfs $ btrfs dev add -f /dev/loop1 /btrfs BTRFS info (device loop0): relocating block group 290455552 flags system BTRFS info (device loop0): relocating block group 1048576 flags system BTRFS info (device loop0): disk added /dev/loop1 $ umount /btrfs Mount the sprout device and run fstrim: $ mount /dev/loop1 /btrfs $ fstrim /btrfs $ umount /btrfs Now try to mount the seed device, and it fails: $ mount /dev/loop0 /btrfs mount: /btrfs: wrong fs type, bad option, bad superblock on /dev/loop0, missing codepage or helper program, or other error. Block 5292032 is missing on the readonly seed device: $ dmesg -kt | tail BTRFS error (device loop0): bad tree block start, want 5292032 have 0 BTRFS warning (device loop0): couldn't read-tree root BTRFS error (device loop0): open_ctree failed From the dump-tree of the seed device (taken before the fstrim). Block 5292032 belonged to the block group starting at 5242880: $ btrfs inspect dump-tree -e /dev/loop0 | grep -A1 BLOCK_GROUP item 3 key (5242880 BLOCK_GROUP_ITEM 8388608) itemoff 16169 itemsize 24 block group used 114688 chunk_objectid 256 flags METADATA From the dump-tree of the sprout device (taken before the fstrim). fstrim used block-group 5242880 to find the related free space to free: $ btrfs inspect dump-tree -e /dev/loop1 | grep -A1 BLOCK_GROUP item 1 key (5242880 BLOCK_GROUP_ITEM 8388608) itemoff 16226 itemsize 24 block group used 32768 chunk_objectid 256 flags METADATA BPF kernel tracing the fstrim command finds the missing block 5292032 within the range of the discarded blocks as below: kprobe:btrfs_discard_extent { printf("freeing start %llu end %llu num_bytes %llu:\n", arg1, arg1+arg2, arg2); } freeing start 5259264 end 5406720 num_bytes 147456 Fix this by avoiding the discard command to the readonly seed device. Reported-by: Chris Murphy CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7a28314189b4a..f1d15b68994a0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1340,12 +1340,16 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; + struct btrfs_device *device = stripe->dev; - if (!stripe->dev->bdev) { + if (!device->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + continue; + ret = do_discard_extent(stripe, &bytes); if (!ret) { discarded_bytes += bytes; From 784daf2b9628f2d0117f1f0b578cfe5ab6634919 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 30 Apr 2021 15:34:17 +0200 Subject: [PATCH 022/263] btrfs: zoned: sanity check zone type The fstests test case generic/475 creates a dm-linear device that gets changed to a dm-error device. This leads to errors in loading the block group's zone information when running on a zoned file system, ultimately resulting in a list corruption. When running on a kernel with list debugging enabled this leads to the following crash. BTRFS: error (device dm-2) in cleanup_transaction:1953: errno=-5 IO failure kernel BUG at lib/list_debug.c:54! invalid opcode: 0000 [#1] SMP PTI CPU: 1 PID: 2433 Comm: umount Tainted: G W 5.12.0+ #1018 RIP: 0010:__list_del_entry_valid.cold+0x1d/0x47 RSP: 0018:ffffc90001473df0 EFLAGS: 00010296 RAX: 0000000000000054 RBX: ffff8881038fd000 RCX: ffffc90001473c90 RDX: 0000000100001a31 RSI: 0000000000000003 RDI: 0000000000000003 RBP: ffff888308871108 R08: 0000000000000003 R09: 0000000000000001 R10: 3961373532383838 R11: 6666666620736177 R12: ffff888308871000 R13: ffff8881038fd088 R14: ffff8881038fdc78 R15: dead000000000100 FS: 00007f353c9b1540(0000) GS:ffff888627d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f353cc2c710 CR3: 000000018e13c000 CR4: 00000000000006a0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_free_block_groups+0xc9/0x310 [btrfs] close_ctree+0x2ee/0x31a [btrfs] ? call_rcu+0x8f/0x270 ? mutex_lock+0x1c/0x40 generic_shutdown_super+0x67/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x90 cleanup_mnt+0x13e/0x1b0 task_work_run+0x63/0xb0 exit_to_user_mode_loop+0xd9/0xe0 exit_to_user_mode_prepare+0x3e/0x60 syscall_exit_to_user_mode+0x1d/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae As dm-error has no support for zones, btrfs will run it's zone emulation mode on this device. The zone emulation mode emulates conventional zones, so bail out if the zone bitmap that gets populated on mount sees the zone as sequential while we're thinking it's a conventional zone when creating a block group. Note: this scenario is unlikely in a real wold application and can only happen by this (ab)use of device-mapper targets. CC: stable@vger.kernel.org # 5.12+ Signed-off-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 70b23a0d03b10..304ce64c70a44 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1126,6 +1126,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + ret = -EIO; + goto out; + } + switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: From 77364faf21b4105ee5adbb4844fdfb461334d249 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 30 Apr 2021 11:06:55 -0700 Subject: [PATCH 023/263] btrfs: initialize return variable in cleanup_free_space_cache_v1 Static analysis reports this problem free-space-cache.c:3965:2: warning: Undefined or garbage value returned return ret; ^~~~~~~~~~ ret is set in the node handling loop. Treat doing nothing as a success and initialize ret to 0, although it's unlikely the loop would be skipped. We always have block groups, but as it could lead to transaction abort in the caller it's better to be safe. CC: stable@vger.kernel.org # 5.12+ Signed-off-by: Tom Rix Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e54466fc101f7..4806295116d88 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3949,7 +3949,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *block_group; struct rb_node *node; - int ret; + int ret = 0; btrfs_info(fs_info, "cleaning free space cache v1"); From 0a269a008f837e76ce285679ab3005059fadc2a6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 14 Apr 2021 14:35:40 +0200 Subject: [PATCH 024/263] x86/kvm: Fix pr_info() for async PF setup/teardown 'pr_fmt' already has 'kvm-guest: ' so 'KVM' prefix is redundant. "Unregister pv shared memory" is very ambiguous, it's hard to say which particular PV feature it relates to. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210414123544.1060604-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d307c22e5c188..dc440bb692223 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -345,7 +345,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); + pr_info("setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { @@ -371,7 +371,7 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); + pr_info("disable async PF for cpu %d\n", smp_processor_id()); } static void kvm_pv_guest_cpu_reboot(void *unused) From d9aa6571b28ba0022de1e48801ff03a1854c7ef2 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Wed, 21 Apr 2021 16:37:35 -0700 Subject: [PATCH 025/263] drm/msm/dp: check sink_count before update is_connected status Link status is different from display connected status in the case of something like an Apple dongle where the type-c plug can be connected, and therefore the link is connected, but no sink is connected until an HDMI cable is plugged into the dongle. The sink_count of DPCD of dongle will increase to 1 once an HDMI cable is plugged into the dongle so that display connected status will become true. This checking also apply at pm_resume. Changes in v4: -- none Fixes: 94e58e2d06e3 ("drm/msm/dp: reset dp controller only at boot up and pm_resume") Reported-by: Stephen Boyd Reviewed-by: Stephen Boyd Tested-by: Stephen Boyd Signed-off-by: Kuogee Hsieh Fixes: 8ede2ecc3e5e ("drm/msm/dp: Add DP compliance tests on Snapdragon Chipsets") Link: https://lore.kernel.org/r/1619048258-8717-2-git-send-email-khsieh@codeaurora.org Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_display.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 5a39da6e1eaf2..0ba71c7a8dd4d 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -586,10 +586,8 @@ static int dp_connect_pending_timeout(struct dp_display_private *dp, u32 data) mutex_lock(&dp->event_mutex); state = dp->hpd_state; - if (state == ST_CONNECT_PENDING) { - dp_display_enable(dp, 0); + if (state == ST_CONNECT_PENDING) dp->hpd_state = ST_CONNECTED; - } mutex_unlock(&dp->event_mutex); @@ -669,10 +667,8 @@ static int dp_disconnect_pending_timeout(struct dp_display_private *dp, u32 data mutex_lock(&dp->event_mutex); state = dp->hpd_state; - if (state == ST_DISCONNECT_PENDING) { - dp_display_disable(dp, 0); + if (state == ST_DISCONNECT_PENDING) dp->hpd_state = ST_DISCONNECTED; - } mutex_unlock(&dp->event_mutex); @@ -1272,7 +1268,12 @@ static int dp_pm_resume(struct device *dev) status = dp_catalog_link_is_connected(dp->catalog); - if (status) + /* + * can not declared display is connected unless + * HDMI cable is plugged in and sink_count of + * dongle become 1 + */ + if (status && dp->link->sink_count) dp->dp_display.is_connected = true; else dp->dp_display.is_connected = false; From f2f46b878777e0d3f885c7ddad48f477b4dea247 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Wed, 21 Apr 2021 16:37:36 -0700 Subject: [PATCH 026/263] drm/msm/dp: initialize audio_comp when audio starts Initialize audio_comp when audio starts and wait for audio_comp at dp_display_disable(). This will take care of both dongle unplugged and display off (suspend) cases. Changes in v2: -- add dp_display_signal_audio_start() Changes in v3: -- restore dp_display_handle_plugged_change() at dp_hpd_unplug_handle(). Changes in v4: -- none Signed-off-by: Kuogee Hsieh Reviewed-by: Stephen Boyd Tested-by: Stephen Boyd Fixes: c703d5789590 ("drm/msm/dp: trigger unplug event in msm_dp_display_disable") Link: https://lore.kernel.org/r/1619048258-8717-3-git-send-email-khsieh@codeaurora.org Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_audio.c | 1 + drivers/gpu/drm/msm/dp/dp_display.c | 11 +++++++++-- drivers/gpu/drm/msm/dp/dp_display.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_audio.c b/drivers/gpu/drm/msm/dp/dp_audio.c index 82a8673ab8daf..d7e4a39a904e2 100644 --- a/drivers/gpu/drm/msm/dp/dp_audio.c +++ b/drivers/gpu/drm/msm/dp/dp_audio.c @@ -527,6 +527,7 @@ int dp_audio_hw_params(struct device *dev, dp_audio_setup_acr(audio); dp_audio_safe_to_exit_level(audio); dp_audio_enable(audio, true); + dp_display_signal_audio_start(dp_display); dp_display->audio_enabled = true; end: diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 0ba71c7a8dd4d..1784e119269b7 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -178,6 +178,15 @@ static int dp_del_event(struct dp_display_private *dp_priv, u32 event) return 0; } +void dp_display_signal_audio_start(struct msm_dp *dp_display) +{ + struct dp_display_private *dp; + + dp = container_of(dp_display, struct dp_display_private, dp_display); + + reinit_completion(&dp->audio_comp); +} + void dp_display_signal_audio_complete(struct msm_dp *dp_display) { struct dp_display_private *dp; @@ -649,7 +658,6 @@ static int dp_hpd_unplug_handle(struct dp_display_private *dp, u32 data) dp_add_event(dp, EV_DISCONNECT_PENDING_TIMEOUT, 0, DP_TIMEOUT_5_SECOND); /* signal the disconnect event early to ensure proper teardown */ - reinit_completion(&dp->audio_comp); dp_display_handle_plugged_change(g_dp_display, false); dp_catalog_hpd_config_intr(dp->catalog, DP_DP_HPD_PLUG_INT_MASK | @@ -894,7 +902,6 @@ static int dp_display_disable(struct dp_display_private *dp, u32 data) /* wait only if audio was enabled */ if (dp_display->audio_enabled) { /* signal the disconnect event */ - reinit_completion(&dp->audio_comp); dp_display_handle_plugged_change(dp_display, false); if (!wait_for_completion_timeout(&dp->audio_comp, HZ * 5)) diff --git a/drivers/gpu/drm/msm/dp/dp_display.h b/drivers/gpu/drm/msm/dp/dp_display.h index 6092ba1ed85ed..5173c89eedf7e 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.h +++ b/drivers/gpu/drm/msm/dp/dp_display.h @@ -34,6 +34,7 @@ int dp_display_get_modes(struct msm_dp *dp_display, int dp_display_request_irq(struct msm_dp *dp_display); bool dp_display_check_video_test(struct msm_dp *dp_display); int dp_display_get_test_bpp(struct msm_dp *dp_display); +void dp_display_signal_audio_start(struct msm_dp *dp_display); void dp_display_signal_audio_complete(struct msm_dp *dp_display); #endif /* _DP_DISPLAY_H_ */ From 8b79feffeca28c5459458fe78676b081e87c93a4 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 14 Apr 2021 14:35:41 +0200 Subject: [PATCH 027/263] x86/kvm: Teardown PV features on boot CPU as well Various PV features (Async PF, PV EOI, steal time) work through memory shared with hypervisor and when we restore from hibernation we must properly teardown all these features to make sure hypervisor doesn't write to stale locations after we jump to the previously hibernated kernel (which can try to place anything there). For secondary CPUs the job is already done by kvm_cpu_down_prepare(), register syscore ops to do the same for boot CPU. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210414123544.1060604-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 56 ++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index dc440bb692223..9d5f96321c7f9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -451,6 +452,25 @@ static void __init sev_map_percpu_data(void) } } +static void kvm_guest_cpu_offline(void) +{ + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); + apf_task_wake_all(); +} + +static int kvm_cpu_online(unsigned int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_init(); + local_irq_restore(flags); + return 0; +} + #ifdef CONFIG_SMP static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); @@ -635,32 +655,34 @@ static void __init kvm_smp_prepare_boot_cpu(void) kvm_spinlock_init(); } -static void kvm_guest_cpu_offline(void) +static int kvm_cpu_down_prepare(unsigned int cpu) { - kvm_disable_steal_time(); - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - apf_task_wake_all(); -} + unsigned long flags; -static int kvm_cpu_online(unsigned int cpu) -{ - local_irq_disable(); - kvm_guest_cpu_init(); - local_irq_enable(); + local_irq_save(flags); + kvm_guest_cpu_offline(); + local_irq_restore(flags); return 0; } -static int kvm_cpu_down_prepare(unsigned int cpu) +#endif + +static int kvm_suspend(void) { - local_irq_disable(); kvm_guest_cpu_offline(); - local_irq_enable(); + return 0; } -#endif +static void kvm_resume(void) +{ + kvm_cpu_online(raw_smp_processor_id()); +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; static void __init kvm_guest_init(void) { @@ -704,6 +726,8 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif + register_syscore_ops(&kvm_syscore_ops); + /* * Hard lockup detection is enabled by default. Disable it, as guests * can get false positives too easily, for example if the host is From c02027b5742b5aa804ef08a4a9db433295533046 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 14 Apr 2021 14:35:42 +0200 Subject: [PATCH 028/263] x86/kvm: Disable kvmclock on all CPUs on shutdown Currenly, we disable kvmclock from machine_shutdown() hook and this only happens for boot CPU. We need to disable it for all CPUs to guard against memory corruption e.g. on restore from hibernate. Note, writing '0' to kvmclock MSR doesn't clear memory location, it just prevents hypervisor from updating the location so for the short while after write and while CPU is still alive, the clock remains usable and correct so we don't need to switch to some other clocksource. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210414123544.1060604-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_para.h | 4 ++-- arch/x86/kernel/kvm.c | 1 + arch/x86/kernel/kvmclock.c | 5 +---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 3381198525126..9c56e0defd453 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,8 +7,6 @@ #include #include -extern void kvmclock_init(void); - #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); #else @@ -86,6 +84,8 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, } #ifdef CONFIG_KVM_GUEST +void kvmclock_init(void); +void kvmclock_disable(void); bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_hints(void); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9d5f96321c7f9..25dd126a33250 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -459,6 +459,7 @@ static void kvm_guest_cpu_offline(void) wrmsrl(MSR_KVM_PV_EOI_EN, 0); kvm_pv_disable_apf(); apf_task_wake_all(); + kvmclock_disable(); } static int kvm_cpu_online(unsigned int cpu) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d37ed4e1d0338..081686df6cd8a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -220,11 +220,9 @@ static void kvm_crash_shutdown(struct pt_regs *regs) } #endif -static void kvm_shutdown(void) +void kvmclock_disable(void) { native_write_msr(msr_kvm_system_time, 0, 0); - kvm_disable_steal_time(); - native_machine_shutdown(); } static void __init kvmclock_init_mem(void) @@ -351,7 +349,6 @@ void __init kvmclock_init(void) #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; - machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC_CORE machine_ops.crash_shutdown = kvm_crash_shutdown; #endif From 3d6b84132d2a57b5a74100f6923a8feb679ac2ce Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 14 Apr 2021 14:35:43 +0200 Subject: [PATCH 029/263] x86/kvm: Disable all PV features on crash Crash shutdown handler only disables kvmclock and steal time, other PV features remain active so we risk corrupting memory or getting some side-effects in kdump kernel. Move crash handler to kvm.c and unify with CPU offline. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210414123544.1060604-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_para.h | 6 ----- arch/x86/kernel/kvm.c | 44 ++++++++++++++++++++++++--------- arch/x86/kernel/kvmclock.c | 21 ---------------- 3 files changed, 32 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 9c56e0defd453..69299878b200a 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -92,7 +92,6 @@ unsigned int kvm_arch_para_hints(void); void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_apf_flags(void); -void kvm_disable_steal_time(void); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -137,11 +136,6 @@ static inline u32 kvm_read_and_reset_apf_flags(void) return 0; } -static inline void kvm_disable_steal_time(void) -{ - return; -} - static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) { return false; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 25dd126a33250..8eb91dc0f5a87 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -38,6 +38,7 @@ #include #include #include +#include #include DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -375,6 +376,14 @@ static void kvm_pv_disable_apf(void) pr_info("disable async PF for cpu %d\n", smp_processor_id()); } +static void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + static void kvm_pv_guest_cpu_reboot(void *unused) { /* @@ -417,14 +426,6 @@ static u64 kvm_steal_clock(int cpu) return steal; } -void kvm_disable_steal_time(void) -{ - if (!has_steal_clock) - return; - - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); -} - static inline void __set_percpu_decrypted(void *ptr, unsigned long size) { early_set_memory_decrypted((unsigned long) ptr, size); @@ -452,13 +453,14 @@ static void __init sev_map_percpu_data(void) } } -static void kvm_guest_cpu_offline(void) +static void kvm_guest_cpu_offline(bool shutdown) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) wrmsrl(MSR_KVM_PV_EOI_EN, 0); kvm_pv_disable_apf(); - apf_task_wake_all(); + if (!shutdown) + apf_task_wake_all(); kvmclock_disable(); } @@ -661,7 +663,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) unsigned long flags; local_irq_save(flags); - kvm_guest_cpu_offline(); + kvm_guest_cpu_offline(false); local_irq_restore(flags); return 0; } @@ -670,7 +672,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static int kvm_suspend(void) { - kvm_guest_cpu_offline(); + kvm_guest_cpu_offline(false); return 0; } @@ -685,6 +687,20 @@ static struct syscore_ops kvm_syscore_ops = { .resume = kvm_resume, }; +/* + * After a PV feature is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. + */ +#ifdef CONFIG_KEXEC_CORE +static void kvm_crash_shutdown(struct pt_regs *regs) +{ + kvm_guest_cpu_offline(true); + native_machine_crash_shutdown(regs); +} +#endif + static void __init kvm_guest_init(void) { int i; @@ -727,6 +743,10 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif +#ifdef CONFIG_KEXEC_CORE + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + register_syscore_ops(&kvm_syscore_ops); /* diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 081686df6cd8a..ad273e5861c1b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -20,7 +20,6 @@ #include #include #include -#include #include static int kvmclock __initdata = 1; @@ -203,23 +202,6 @@ static void kvm_setup_secondary_clock(void) } #endif -/* - * After the clock is registered, the host will keep writing to the - * registered memory location. If the guest happens to shutdown, this memory - * won't be valid. In cases like kexec, in which you install a new kernel, this - * means a random memory location will be kept being written. So before any - * kind of shutdown from our side, we unregister the clock by writing anything - * that does not have the 'enable' bit set in the msr - */ -#ifdef CONFIG_KEXEC_CORE -static void kvm_crash_shutdown(struct pt_regs *regs) -{ - native_write_msr(msr_kvm_system_time, 0, 0); - kvm_disable_steal_time(); - native_machine_crash_shutdown(regs); -} -#endif - void kvmclock_disable(void) { native_write_msr(msr_kvm_system_time, 0, 0); @@ -349,9 +331,6 @@ void __init kvmclock_init(void) #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; -#ifdef CONFIG_KEXEC_CORE - machine_ops.crash_shutdown = kvm_crash_shutdown; -#endif kvm_get_preset_lpj(); /* From 384fc672f528d3b84eacd9a86ecf35df3363b8ba Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 14 Apr 2021 14:35:44 +0200 Subject: [PATCH 030/263] x86/kvm: Unify kvm_pv_guest_cpu_reboot() with kvm_guest_cpu_offline() Simplify the code by making PV features shutdown happen in one place. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210414123544.1060604-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8eb91dc0f5a87..a26643dc6bd63 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -384,31 +384,6 @@ static void kvm_disable_steal_time(void) wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } -static void kvm_pv_guest_cpu_reboot(void *unused) -{ - /* - * We disable PV EOI before we load a new kernel by kexec, - * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. - * New kernel can re-enable when it boots. - */ - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - kvm_disable_steal_time(); -} - -static int kvm_pv_reboot_notify(struct notifier_block *nb, - unsigned long code, void *unused) -{ - if (code == SYS_RESTART) - on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); - return NOTIFY_DONE; -} - -static struct notifier_block kvm_pv_reboot_nb = { - .notifier_call = kvm_pv_reboot_notify, -}; - static u64 kvm_steal_clock(int cpu) { u64 steal; @@ -687,6 +662,23 @@ static struct syscore_ops kvm_syscore_ops = { .resume = kvm_resume, }; +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + kvm_guest_cpu_offline(true); +} + +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; + /* * After a PV feature is registered, the host will keep writing to the * registered memory location. If the guest happens to shutdown, this memory From 46a63924b05f335b0765ad13dae4d2d7569f25c9 Mon Sep 17 00:00:00 2001 From: Siddharth Chandrasekaran Date: Mon, 3 May 2021 14:00:58 +0200 Subject: [PATCH 031/263] doc/kvm: Fix wrong entry for KVM_CAP_X86_MSR_FILTER The capability that exposes new ioctl KVM_X86_SET_MSR_FILTER to userspace is specified incorrectly as the ioctl itself (instead of KVM_CAP_X86_MSR_FILTER). This patch fixes it. Fixes: 1a155254ff93 ("KVM: x86: Introduce MSR filtering") Reviewed-by: Alexander Graf Signed-off-by: Siddharth Chandrasekaran Message-Id: <20210503120059.9283-1-sidcha@amazon.de> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 22d0775621496..7fcb2fd38f42e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4803,7 +4803,7 @@ KVM_PV_VM_VERIFY 4.126 KVM_X86_SET_MSR_FILTER ---------------------------- -:Capability: KVM_X86_SET_MSR_FILTER +:Capability: KVM_CAP_X86_MSR_FILTER :Architectures: x86 :Type: vm ioctl :Parameters: struct kvm_msr_filter @@ -6715,7 +6715,7 @@ accesses that would usually trigger a #GP by KVM into the guest will instead get bounced to user space through the KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. -8.27 KVM_X86_SET_MSR_FILTER +8.27 KVM_CAP_X86_MSR_FILTER --------------------------- :Architectures: x86 From f5c7e8425f18fdb9bdb7d13340651d7876890329 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 3 May 2021 17:08:51 +0200 Subject: [PATCH 032/263] KVM: nVMX: Always make an attempt to map eVMCS after migration When enlightened VMCS is in use and nested state is migrated with vmx_get_nested_state()/vmx_set_nested_state() KVM can't map evmcs page right away: evmcs gpa is not 'struct kvm_vmx_nested_state_hdr' and we can't read it from VP assist page because userspace may decide to restore HV_X64_MSR_VP_ASSIST_PAGE after restoring nested state (and QEMU, for example, does exactly that). To make sure eVMCS is mapped /vmx_set_nested_state() raises KVM_REQ_GET_NESTED_STATE_PAGES request. Commit f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit") added KVM_REQ_GET_NESTED_STATE_PAGES clearing to nested_vmx_vmexit() to make sure MSR permission bitmap is not switched when an immediate exit from L2 to L1 happens right after migration (caused by a pending event, for example). Unfortunately, in the exact same situation we still need to have eVMCS mapped so nested_sync_vmcs12_to_shadow() reflects changes in VMCS12 to eVMCS. As a band-aid, restore nested_get_evmcs_page() when clearing KVM_REQ_GET_NESTED_STATE_PAGES in nested_vmx_vmexit(). The 'fix' is far from being ideal as we can't easily propagate possible failures and even if we could, this is most likely already too late to do so. The whole 'KVM_REQ_GET_NESTED_STATE_PAGES' idea for mapping eVMCS after migration seems to be fragile as we diverge too much from the 'native' path when vmptr loading happens on vmx_set_nested_state(). Fixes: f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit") Signed-off-by: Vitaly Kuznetsov Message-Id: <20210503150854.1144255-2-vkuznets@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bced766378232..6058a65a6ede6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) nested_vmx_handle_enlightened_vmptrld(vcpu, false); if (evmptrld_status == EVMPTRLD_VMFAIL || - evmptrld_status == EVMPTRLD_ERROR) { - pr_debug_ratelimited("%s: enlightened vmptrld failed\n", - __func__); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + evmptrld_status == EVMPTRLD_ERROR) return false; - } } return true; @@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) { - if (!nested_get_evmcs_page(vcpu)) + if (!nested_get_evmcs_page(vcpu)) { + pr_debug_ratelimited("%s: enlightened vmptrld failed\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) return false; @@ -4435,7 +4436,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* Similarly, triple faults in L2 should never escape. */ WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { + /* + * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map + * Enlightened VMCS after migration and we still need to + * do that when something is forcing L2->L1 exit prior to + * the first L2 run. + */ + (void)nested_get_evmcs_page(vcpu); + } /* Service the TLB flush request for L2 before switching to L1. */ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) From 32d1b3ab588c1231dbfa9eb08819c50529ce77d7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 May 2021 17:18:21 +0200 Subject: [PATCH 033/263] KVM: selftests: evmcs_test: Check that VMLAUNCH with bogus EVMPTR is causing #UD 'run->exit_reason == KVM_EXIT_SHUTDOWN' check is not ideal as we may be getting some unexpected exception. Directly check for #UD instead. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210505151823.1341678-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/evmcs_test.c | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index ca22ee6d19cbd..b01f64ac6ce30 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -19,6 +19,14 @@ #define VCPU_ID 5 +static int ud_count; + +static void guest_ud_handler(struct ex_regs *regs) +{ + ud_count++; + regs->rip += 3; /* VMLAUNCH */ +} + void l2_guest_code(void) { GUEST_SYNC(7); @@ -71,11 +79,11 @@ void guest_code(struct vmx_pages *vmx_pages) if (vmx_pages) l1_guest_code(vmx_pages); - GUEST_DONE(); - /* Try enlightened vmptrld with an incorrect GPA */ evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(ud_count == 1); + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -109,6 +117,10 @@ int main(int argc, char *argv[]) vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); + for (stage = 1;; stage++) { _vcpu_run(vm, VCPU_ID); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, @@ -124,7 +136,7 @@ int main(int argc, char *argv[]) case UCALL_SYNC: break; case UCALL_DONE: - goto part1_done; + goto done; default: TEST_FAIL("Unknown ucall %lu", uc.cmd); } @@ -156,10 +168,6 @@ int main(int argc, char *argv[]) (ulong) regs2.rdi, (ulong) regs2.rsi); } -part1_done: - _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, - "Unexpected successful VMEnter with invalid eVMCS pointer!"); - +done: kvm_vm_free(vm); } From c9ecafaf0113a305f5085ceb9c7a4b64ca70eae9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 May 2021 17:18:22 +0200 Subject: [PATCH 034/263] KVM: selftests: evmcs_test: Check that VMCS12 is alway properly synced to eVMCS after restore Add a test for the regression, introduced by commit f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit"). When L2->L1 exit is forced immediately after restoring nested state, KVM_REQ_GET_NESTED_STATE_PAGES request is cleared and VMCS12 changes (e.g. fresh RIP) are not reflected to eVMCS. The consequent nested vCPU run gets broken. Utilize NMI injection to do the job. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210505151823.1341678-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/evmcs_test.c | 66 +++++++++++++++---- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index b01f64ac6ce30..63096cea26c61 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -18,30 +18,52 @@ #include "vmx.h" #define VCPU_ID 5 +#define NMI_VECTOR 2 static int ud_count; +void enable_x2apic(void) +{ + uint32_t spiv_reg = APIC_BASE_MSR + (APIC_SPIV >> 4); + + wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | + MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); + wrmsr(spiv_reg, rdmsr(spiv_reg) | APIC_SPIV_APIC_ENABLED); +} + static void guest_ud_handler(struct ex_regs *regs) { ud_count++; regs->rip += 3; /* VMLAUNCH */ } +static void guest_nmi_handler(struct ex_regs *regs) +{ +} + void l2_guest_code(void) { GUEST_SYNC(7); GUEST_SYNC(8); + /* Forced exit to L1 upon restore */ + GUEST_SYNC(9); + /* Done, exit to L1 and never come back. */ vmcall(); } -void l1_guest_code(struct vmx_pages *vmx_pages) +void guest_code(struct vmx_pages *vmx_pages) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + enable_x2apic(); + + GUEST_SYNC(1); + GUEST_SYNC(2); + enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); GUEST_ASSERT(vmx_pages->vmcs_gpa); @@ -63,21 +85,22 @@ void l1_guest_code(struct vmx_pages *vmx_pages) current_evmcs->revision_id = EVMCS_VERSION; GUEST_SYNC(6); + current_evmcs->pin_based_vm_exec_control |= + PIN_BASED_NMI_EXITING; GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); - GUEST_SYNC(9); + + /* + * NMI forces L2->L1 exit, resuming L2 and hope that EVMCS is + * up-to-date (RIP points where it should and not at the beginning + * of l2_guest_code(). GUEST_SYNC(9) checkes that. + */ GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_SYNC(10); -} -void guest_code(struct vmx_pages *vmx_pages) -{ - GUEST_SYNC(1); - GUEST_SYNC(2); + GUEST_SYNC(10); - if (vmx_pages) - l1_guest_code(vmx_pages); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_SYNC(11); /* Try enlightened vmptrld with an incorrect GPA */ evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); @@ -86,6 +109,18 @@ void guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } +void inject_nmi(struct kvm_vm *vm) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vm, VCPU_ID, &events); + + events.nmi.pending = 1; + events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; + + vcpu_events_set(vm, VCPU_ID, &events); +} + int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva = 0; @@ -120,6 +155,9 @@ int main(int argc, char *argv[]) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, VCPU_ID); vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); + vm_handle_exception(vm, NMI_VECTOR, guest_nmi_handler); + + pr_info("Running L1 which uses EVMCS to run L2\n"); for (stage = 1;; stage++) { _vcpu_run(vm, VCPU_ID); @@ -166,6 +204,12 @@ int main(int argc, char *argv[]) TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); + + /* Force immediate L2->L1 exit before resuming */ + if (stage == 8) { + pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n"); + inject_nmi(vm); + } } done: From 70f094f4f01dc4d6f78ac6407f85627293a6553c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 3 May 2021 17:08:52 +0200 Subject: [PATCH 035/263] KVM: nVMX: Properly pad 'struct kvm_vmx_nested_state_hdr' Eliminate the probably unwanted hole in 'struct kvm_vmx_nested_state_hdr': Pre-patch: struct kvm_vmx_nested_state_hdr { __u64 vmxon_pa; /* 0 8 */ __u64 vmcs12_pa; /* 8 8 */ struct { __u16 flags; /* 16 2 */ } smm; /* 16 2 */ /* XXX 2 bytes hole, try to pack */ __u32 flags; /* 20 4 */ __u64 preemption_timer_deadline; /* 24 8 */ }; Post-patch: struct kvm_vmx_nested_state_hdr { __u64 vmxon_pa; /* 0 8 */ __u64 vmcs12_pa; /* 8 8 */ struct { __u16 flags; /* 16 2 */ } smm; /* 16 2 */ __u16 pad; /* 18 2 */ __u32 flags; /* 20 4 */ __u64 preemption_timer_deadline; /* 24 8 */ }; Signed-off-by: Vitaly Kuznetsov Message-Id: <20210503150854.1144255-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5a3022c8af82b..0662f644aad9d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -437,6 +437,8 @@ struct kvm_vmx_nested_state_hdr { __u16 flags; } smm; + __u16 pad; + __u32 flags; __u64 preemption_timer_deadline; }; From 5f443e424efab56baa8021da04878f88eb0815d4 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Thu, 10 Dec 2020 17:23:17 -0800 Subject: [PATCH 036/263] selftests: kvm: remove reassignment of non-absolute variables Clang's integrated assembler does not allow symbols with non-absolute values to be reassigned. Modify the interrupt entry loop macro to be compatible with IAS by using a label and an offset. Cc: Jian Cai Signed-off-by: Bill Wendling References: https://lore.kernel.org/lkml/20200714233024.1789985-1-caij2003@gmail.com/ Message-Id: <20201211012317.3722214-1-morbo@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/handlers.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S index aaf7bc7d2ce18..7629819734afd 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/handlers.S +++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S @@ -54,9 +54,9 @@ idt_handlers: .align 8 /* Fetch current address and append it to idt_handlers. */ - current_handler = . +666 : .pushsection .rodata -.quad current_handler + .quad 666b .popsection .if ! \has_error From aca352886ebdd675b5131ed4c83bf5477eee5d72 Mon Sep 17 00:00:00 2001 From: Siddharth Chandrasekaran Date: Mon, 3 May 2021 14:21:11 +0200 Subject: [PATCH 037/263] KVM: x86: Hoist input checks in kvm_add_msr_filter() In ioctl KVM_X86_SET_MSR_FILTER, input from user space is validated after a memdup_user(). For invalid inputs we'd memdup and then call kfree unnecessarily. Hoist input validation to avoid kfree altogether. Signed-off-by: Siddharth Chandrasekaran Message-Id: <20210503122111.13775-1-sidcha@amazon.de> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cebdaa1e3cf5b..102f116d9bb40 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5468,14 +5468,18 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, struct kvm_msr_filter_range *user_range) { - struct msr_bitmap_range range; unsigned long *bitmap = NULL; size_t bitmap_size; - int r; if (!user_range->nmsrs) return 0; + if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) + return -EINVAL; + + if (!user_range->flags) + return -EINVAL; + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) return -EINVAL; @@ -5484,31 +5488,15 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, if (IS_ERR(bitmap)) return PTR_ERR(bitmap); - range = (struct msr_bitmap_range) { + msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) { .flags = user_range->flags, .base = user_range->base, .nmsrs = user_range->nmsrs, .bitmap = bitmap, }; - if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { - r = -EINVAL; - goto err; - } - - if (!range.flags) { - r = -EINVAL; - goto err; - } - - /* Everything ok, add this range identifier. */ - msr_filter->ranges[msr_filter->count] = range; msr_filter->count++; - return 0; -err: - kfree(bitmap); - return r; } static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) From 063ab16c14db5a2ef52d54d0475b7fed19c982d7 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 4 May 2021 17:39:35 +0300 Subject: [PATCH 038/263] KVM: nSVM: always restore the L1's GIF on migration While usually the L1's GIF is set while L2 runs, and usually migration nested state is loaded after a vCPU reset which also sets L1's GIF to true, this is not guaranteed. Signed-off-by: Maxim Levitsky Message-Id: <20210504143936.1644378-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 32400cba608d4..b331446f67f3b 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1314,6 +1314,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, else svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + svm->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); From 809c79137a192d7e881a517f803ebbf96305f066 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 4 May 2021 17:39:36 +0300 Subject: [PATCH 039/263] KVM: nSVM: remove a warning about vmcb01 VM exit reason While in most cases, when returning to use the VMCB01, the exit reason stored in it will be SVM_EXIT_VMRUN, on first VM exit after a nested migration this field can contain anything since the VM entry did happen before the migration. Remove this warning to avoid the false positive. Signed-off-by: Maxim Levitsky Message-Id: <20210504143936.1644378-3-mlevitsk@redhat.com> Fixes: 9a7de6ecc3ed ("KVM: nSVM: If VMRUN is single-stepped, queue the #DB intercept in nested_svm_vmexit()") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b331446f67f3b..5e8d8443154e8 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -764,7 +764,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); - WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN); /* * On vmexit the GIF is set to false and From 8aec21c04caa2000f91cf8822ae0811e4b0c3971 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:20 -0700 Subject: [PATCH 040/263] KVM: VMX: Do not advertise RDPID if ENABLE_RDTSCP control is unsupported Clear KVM's RDPID capability if the ENABLE_RDTSCP secondary exec control is unsupported. Despite being enumerated in a separate CPUID flag, RDPID is bundled under the same VMCS control as RDTSCP and will #UD in VMX non-root if ENABLE_RDTSCP is not enabled. Fixes: 41cd02c6f7f6 ("kvm: x86: Expose RDPID in KVM_GET_SUPPORTED_CPUID") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-2-seanjc@google.com> Reviewed-by: Jim Mattson Reviewed-by: Reiji Watanabe Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cbe0cdade38a5..46573b8626385 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7377,9 +7377,11 @@ static __init void vmx_set_cpu_caps(void) if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); - /* CPUID 0x80000001 */ - if (!cpu_has_vmx_rdtscp()) + /* CPUID 0x80000001 and 0x7 (RDPID) */ + if (!cpu_has_vmx_rdtscp()) { kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); + kvm_cpu_cap_clear(X86_FEATURE_RDPID); + } if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); From 85d0011264da24be08ae907d7f29983a597ca9b1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:21 -0700 Subject: [PATCH 041/263] KVM: x86: Emulate RDPID only if RDTSCP is supported Do not advertise emulation support for RDPID if RDTSCP is unsupported. RDPID emulation subtly relies on MSR_TSC_AUX to exist in hardware, as both vmx_get_msr() and svm_get_msr() will return an error if the MSR is unsupported, i.e. ctxt->ops->get_msr() will fail and the emulator will inject a #UD. Note, RDPID emulation also relies on RDTSCP being enabled in the guest, but this is a KVM bug and will eventually be fixed. Fixes: fb6d4d340e05 ("KVM: x86: emulate RDPID") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-3-seanjc@google.com> Reviewed-by: Jim Mattson Reviewed-by: Reiji Watanabe Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 19606a3418889..c0e8c5e921898 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -637,7 +637,8 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->eax = 0; - entry->ecx = F(RDPID); + if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) + entry->ecx = F(RDPID); ++array->nent; default: break; From 3b195ac9260235624b1c18f7bdaef184479c1d41 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:22 -0700 Subject: [PATCH 042/263] KVM: SVM: Inject #UD on RDTSCP when it should be disabled in the guest Intercept RDTSCP to inject #UD if RDTSC is disabled in the guest. Note, SVM does not support intercepting RDPID. Unlike VMX's ENABLE_RDTSCP control, RDTSCP interception does not apply to RDPID. This is a benign virtualization hole as the host kernel (incorrectly) sets MSR_TSC_AUX if RDTSCP is supported, and KVM loads the guest's MSR_TSC_AUX into hardware if RDTSCP is supported in the host, i.e. KVM will not leak the host's MSR_TSC_AUX to the guest. But, when the kernel bug is fixed, KVM will start leaking the host's MSR_TSC_AUX if RDPID is supported in hardware, but RDTSCP isn't available for whatever reason. This leak will be remedied in a future commit. Fixes: 46896c73c1a4 ("KVM: svm: add support for RDTSCP") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-4-seanjc@google.com> Reviewed-by: Jim Mattson Reviewed-by: Reiji Watanabe Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index be5cf612ab1fe..ebcb5849d69b1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1100,7 +1100,9 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) return svm->vmcb->control.tsc_offset; } -static void svm_check_invpcid(struct vcpu_svm *svm) +/* Evaluate instruction intercepts that depend on guest CPUID features. */ +static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, + struct vcpu_svm *svm) { /* * Intercept INVPCID if shadow paging is enabled to sync/free shadow @@ -1113,6 +1115,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm) else svm_clr_intercept(svm, INTERCEPT_INVPCID); } + + if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + svm_clr_intercept(svm, INTERCEPT_RDTSCP); + else + svm_set_intercept(svm, INTERCEPT_RDTSCP); + } } static void init_vmcb(struct kvm_vcpu *vcpu) @@ -1248,7 +1257,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_PAUSE); } - svm_check_invpcid(svm); + svm_recalc_instruction_intercepts(vcpu, svm); /* * If the host supports V_SPEC_CTRL then disable the interception @@ -3084,6 +3093,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, + [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, [SVM_EXIT_MONITOR] = kvm_emulate_monitor, [SVM_EXIT_MWAIT] = kvm_emulate_mwait, @@ -4007,8 +4017,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); - /* Check again if INVPCID interception if required */ - svm_check_invpcid(svm); + svm_recalc_instruction_intercepts(vcpu, svm); /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { From 2183de4161b90bd3851ccd3910c87b2c9adfc6ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:23 -0700 Subject: [PATCH 043/263] KVM: x86: Move RDPID emulation intercept to its own enum Add a dedicated intercept enum for RDPID instead of piggybacking RDTSCP. Unlike VMX's ENABLE_RDTSCP, RDPID is not bound to SVM's RDTSCP intercept. Fixes: fb6d4d340e05 ("KVM: x86: emulate RDPID") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-5-seanjc@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/kvm_emulate.h | 1 + arch/x86/kvm/vmx/vmx.c | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 77e1c89a95a7f..8a0ccdb560766 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4502,7 +4502,7 @@ static const struct opcode group8[] = { * from the register case of group9. */ static const struct gprefix pfx_0f_c7_7 = { - N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp), + N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid), }; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 0d359115429ad..f016838faedd6 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -468,6 +468,7 @@ enum x86_intercept { x86_intercept_clgi, x86_intercept_skinit, x86_intercept_rdtscp, + x86_intercept_rdpid, x86_intercept_icebp, x86_intercept_wbinvd, x86_intercept_monitor, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 46573b8626385..4a625c7482756 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7437,8 +7437,9 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, /* * RDPID causes #UD if disabled through secondary execution controls. * Because it is marked as EmulateOnUD, we need to intercept it here. + * Note, RDPID is hidden behind ENABLE_RDTSCP. */ - case x86_intercept_rdtscp: + case x86_intercept_rdpid: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; From 5104d7ffcf24749939bea7fdb5378d186473f890 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:24 -0700 Subject: [PATCH 044/263] KVM: VMX: Disable preemption when probing user return MSRs Disable preemption when probing a user return MSR via RDSMR/WRMSR. If the MSR holds a different value per logical CPU, the WRMSR could corrupt the host's value if KVM is preempted between the RDMSR and WRMSR, and then rescheduled on a different CPU. Opportunistically land the helper in common x86, SVM will use the helper in a future commit. Fixes: 4be534102624 ("KVM: VMX: Initialize vmx->guest_msrs[] right after allocation") Cc: stable@vger.kernel.org Cc: Xiaoyao Li Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-6-seanjc@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 5 +---- arch/x86/kvm/x86.c | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 848956bb3cf1d..e8dcbc632cf8e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1777,6 +1777,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long icr, int op_64_bit); void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_probe_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4a625c7482756..11ff9c3d95d53 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6914,12 +6914,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { u32 index = vmx_uret_msrs_list[i]; - u32 data_low, data_high; int j = vmx->nr_uret_msrs; - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) + if (kvm_probe_user_return_msr(index)) continue; vmx->guest_uret_msrs[j].slot = i; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 102f116d9bb40..bd90c73c37b4d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -339,6 +339,22 @@ static void kvm_on_user_return(struct user_return_notifier *urn) } } +int kvm_probe_user_return_msr(u32 msr) +{ + u64 val; + int ret; + + preempt_disable(); + ret = rdmsrl_safe(msr, &val); + if (ret) + goto out; + ret = wrmsrl_safe(msr, val); +out: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr); + void kvm_define_user_return_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); From 0caa0a77c2f6fcd0830cdcd018db1af98fe35e28 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:25 -0700 Subject: [PATCH 045/263] KVM: SVM: Probe and load MSR_TSC_AUX regardless of RDTSCP support in host Probe MSR_TSC_AUX whether or not RDTSCP is supported in the host, and if probing succeeds, load the guest's MSR_TSC_AUX into hardware prior to VMRUN. Because SVM doesn't support interception of RDPID, RDPID cannot be disallowed in the guest (without resorting to binary translation). Leaving the host's MSR_TSC_AUX in hardware would leak the host's value to the guest if RDTSCP is not supported. Note, there is also a kernel bug that prevents leaking the host's value. The host kernel initializes MSR_TSC_AUX if and only if RDTSCP is supported, even though the vDSO usage consumes MSR_TSC_AUX via RDPID. I.e. if RDTSCP is not supported, there is no host value to leak. But, if/when the host kernel bug is fixed, KVM would start leaking MSR_TSC_AUX in the case where hardware supports RDPID but RDTSCP is unavailable for whatever reason. Probing MSR_TSC_AUX will also allow consolidating the probe and define logic in common x86, and will make it simpler to condition the existence of MSR_TSX_AUX (from the guest's perspective) on RDTSCP *or* RDPID. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ebcb5849d69b1..13e4dd128177d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -212,7 +212,7 @@ DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to * defer the restoration of TSC_AUX until the CPU returns to userspace. */ -#define TSC_AUX_URET_SLOT 0 +static int tsc_aux_uret_slot __read_mostly = -1; static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; @@ -959,8 +959,10 @@ static __init int svm_hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 32; } - if (boot_cpu_has(X86_FEATURE_RDTSCP)) - kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX); + if (!kvm_probe_user_return_msr(MSR_TSC_AUX)) { + tsc_aux_uret_slot = 0; + kvm_define_user_return_msr(tsc_aux_uret_slot, MSR_TSC_AUX); + } /* Check for pause filtering support */ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { @@ -1454,8 +1456,8 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) } } - if (static_cpu_has(X86_FEATURE_RDTSCP)) - kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull); + if (likely(tsc_aux_uret_slot >= 0)) + kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); svm->guest_state_loaded = true; } @@ -2664,7 +2666,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; case MSR_TSC_AUX: - if (!boot_cpu_has(X86_FEATURE_RDTSCP)) + if (tsc_aux_uret_slot < 0) return 1; if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) @@ -2885,7 +2887,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: - if (!boot_cpu_has(X86_FEATURE_RDTSCP)) + if (tsc_aux_uret_slot < 0) return 1; if (!msr->host_initiated && @@ -2908,7 +2910,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * guest via direct_access_msrs, and switch it via user return. */ preempt_disable(); - r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull); + r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); preempt_enable(); if (r) return 1; From 36fa06f9ff39f23e03cd8206dc6bbb7711c23be6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:26 -0700 Subject: [PATCH 046/263] KVM: x86: Add support for RDPID without RDTSCP Allow userspace to enable RDPID for a guest without also enabling RDTSCP. Aside from checking for RDPID support in the obvious flows, VMX also needs to set ENABLE_RDTSCP=1 when RDPID is exposed. For the record, there is no known scenario where enabling RDPID without RDTSCP is desirable. But, both AMD and Intel architectures allow for the condition, i.e. this is purely to make KVM more architecturally accurate. Fixes: 41cd02c6f7f6 ("kvm: x86: Expose RDPID in KVM_GET_SUPPORTED_CPUID") Cc: stable@vger.kernel.org Reported-by: Reiji Watanabe Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 6 ++++-- arch/x86/kvm/vmx/vmx.c | 27 +++++++++++++++++++++++---- arch/x86/kvm/x86.c | 3 ++- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 13e4dd128177d..0ba0a00f8dc6a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2669,7 +2669,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (tsc_aux_uret_slot < 0) return 1; if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) return 1; msr_info->data = svm->tsc_aux; break; @@ -2891,7 +2892,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 1; if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) return 1; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 11ff9c3d95d53..b304e372aab3c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1788,7 +1788,8 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (update_transition_efer(vmx)) vmx_setup_uret_msr(vmx, MSR_EFER); - if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) + if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)) vmx_setup_uret_msr(vmx, MSR_TSC_AUX); vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); @@ -1994,7 +1995,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_TSC_AUX: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) return 1; goto find_uret_msr; case MSR_IA32_DEBUGCTLMSR: @@ -2314,7 +2316,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_TSC_AUX: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -4368,7 +4371,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) xsaves_enabled, false); } - vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); + /* + * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either + * feature is exposed to the guest. This creates a virtualization hole + * if both are supported in hardware but only one is exposed to the + * guest, but letting the guest execute RDTSCP or RDPID when either one + * is advertised is preferable to emulating the advertised instruction + * in KVM on #UD, and obviously better than incorrectly injecting #UD. + */ + if (cpu_has_vmx_rdtscp()) { + bool rdpid_or_rdtscp_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + + vmx_adjust_secondary_exec_control(vmx, &exec_control, + SECONDARY_EXEC_ENABLE_RDTSCP, + rdpid_or_rdtscp_enabled, false); + } vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd90c73c37b4d..0856636efc44c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5941,7 +5941,8 @@ static void kvm_init_msr_list(void) continue; break; case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) continue; break; case MSR_IA32_UMWAIT_CONTROL: From b6194b94a2ca4affce5aab1bbf773a977ad73671 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:27 -0700 Subject: [PATCH 047/263] KVM: VMX: Configure list of user return MSRs at module init Configure the list of user return MSRs that are actually supported at module init instead of reprobing the list of possible MSRs every time a vCPU is created. Curating the list on a per-vCPU basis is pointless; KVM is completely hosed if the set of supported MSRs changes after module init, or if the set of MSRs differs per physical PCU. The per-vCPU lists also increase complexity (see __vmx_find_uret_msr()) and creates corner cases that _should_ be impossible, but theoretically exist in KVM, e.g. advertising RDTSCP to userspace without actually being able to virtualize RDTSCP if probing MSR_TSC_AUX fails. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 61 ++++++++++++++++++++++++++++-------------- arch/x86/kvm/vmx/vmx.h | 10 ++++++- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b304e372aab3c..887db1af13125 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -461,7 +461,7 @@ static unsigned long host_idt_base; * support this emulation, IA32_STAR must always be included in * vmx_uret_msrs_list[], even in i386 builds. */ -static const u32 vmx_uret_msrs_list[] = { +static u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif @@ -469,6 +469,12 @@ static const u32 vmx_uret_msrs_list[] = { MSR_IA32_TSX_CTRL, }; +/* + * Number of user return MSRs that are actually supported in hardware. + * vmx_uret_msrs_list is modified when KVM is loaded to drop unsupported MSRs. + */ +static int vmx_nr_uret_msrs; + #if IS_ENABLED(CONFIG_HYPERV) static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -700,9 +706,16 @@ static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - for (i = 0; i < vmx->nr_uret_msrs; ++i) + /* + * Note, vmx->guest_uret_msrs is the same size as vmx_uret_msrs_list, + * but is ordered differently. The MSR is matched against the list of + * supported uret MSRs using "slot", but the index that is returned is + * the index into guest_uret_msrs. + */ + for (i = 0; i < vmx_nr_uret_msrs; ++i) { if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr) return i; + } return -1; } @@ -6929,18 +6942,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); - - for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { - u32 index = vmx_uret_msrs_list[i]; - int j = vmx->nr_uret_msrs; + for (i = 0; i < vmx_nr_uret_msrs; ++i) { + vmx->guest_uret_msrs[i].data = 0; - if (kvm_probe_user_return_msr(index)) - continue; - - vmx->guest_uret_msrs[j].slot = i; - vmx->guest_uret_msrs[j].data = 0; - switch (index) { + switch (vmx_uret_msrs_list[i]) { case MSR_IA32_TSX_CTRL: /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID @@ -6954,15 +6959,14 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) * host so that TSX remains always disabled. */ if (boot_cpu_has(X86_FEATURE_RTM)) - vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; else - vmx->guest_uret_msrs[j].mask = 0; + vmx->guest_uret_msrs[i].mask = 0; break; default: - vmx->guest_uret_msrs[j].mask = -1ull; + vmx->guest_uret_msrs[i].mask = -1ull; break; } - ++vmx->nr_uret_msrs; } err = alloc_loaded_vmcs(&vmx->vmcs01); @@ -7821,17 +7825,34 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; +static __init void vmx_setup_user_return_msrs(void) +{ + u32 msr; + int i; + + BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); + + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { + msr = vmx_uret_msrs_list[i]; + + if (kvm_probe_user_return_msr(msr)) + continue; + + kvm_define_user_return_msr(vmx_nr_uret_msrs, msr); + vmx_uret_msrs_list[vmx_nr_uret_msrs++] = msr; + } +} + static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, i, ept_lpage_level; + int r, ept_lpage_level; store_idt(&dt); host_idt_base = dt.address; - for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) - kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]); + vmx_setup_user_return_msrs(); if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 008cb87ff088c..d71ed8b425c52 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -245,8 +245,16 @@ struct vcpu_vmx { u32 idt_vectoring_info; ulong rflags; + /* + * User return MSRs are always emulated when enabled in the guest, but + * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside + * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to + * be loaded into hardware if those conditions aren't met. + * nr_active_uret_msrs tracks the number of MSRs that need to be loaded + * into hardware when running the guest. guest_uret_msrs[] is resorted + * whenever the number of "active" uret MSRs is modified. + */ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; - int nr_uret_msrs; int nr_active_uret_msrs; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 From ee9d22e08d1341692a43926e5e1d84c90a5dac1d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:28 -0700 Subject: [PATCH 048/263] KVM: VMX: Use flag to indicate "active" uret MSRs instead of sorting list Explicitly flag a uret MSR as needing to be loaded into hardware instead of resorting the list of "active" MSRs and tracking how many MSRs in total need to be loaded. The only benefit to sorting the list is that the loop to load MSRs during vmx_prepare_switch_to_guest() doesn't need to iterate over all supported uret MRS, only those that are active. But that is a pointless optimization, as the most common case, running a 64-bit guest, will load the vast majority of MSRs. Not to mention that a single WRMSR is far more expensive than iterating over the list. Providing a stable list order obviates the need to track a given MSR's "slot" in the per-CPU list of user return MSRs; all lists simply use the same ordering. Future patches will take advantage of the stable order to further simplify the related code. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 80 ++++++++++++++++++++++-------------------- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 887db1af13125..adefedac0e3b9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -458,8 +458,9 @@ static unsigned long host_idt_base; * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm * will emulate SYSCALL in legacy mode if the vendor string in guest * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To - * support this emulation, IA32_STAR must always be included in - * vmx_uret_msrs_list[], even in i386 builds. + * support this emulation, MSR_STAR is included in the list for i386, + * but is never loaded into hardware. MSR_CSTAR is also never loaded + * into hardware and is here purely for emulation purposes. */ static u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 @@ -702,18 +703,12 @@ static bool is_valid_passthrough_msr(u32 msr) return r; } -static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) +static inline int __vmx_find_uret_msr(u32 msr) { int i; - /* - * Note, vmx->guest_uret_msrs is the same size as vmx_uret_msrs_list, - * but is ordered differently. The MSR is matched against the list of - * supported uret MSRs using "slot", but the index that is returned is - * the index into guest_uret_msrs. - */ for (i = 0; i < vmx_nr_uret_msrs; ++i) { - if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr) + if (vmx_uret_msrs_list[i] == msr) return i; } return -1; @@ -723,7 +718,7 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __vmx_find_uret_msr(vmx, msr); + i = __vmx_find_uret_msr(msr); if (i >= 0) return &vmx->guest_uret_msrs[i]; return NULL; @@ -732,13 +727,14 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data) { + unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; u64 old_msr_data = msr->data; msr->data = data; - if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { + if (msr->load_into_hardware) { preempt_disable(); - ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask); + ret = kvm_set_user_return_msr(slot, msr->data, msr->mask); preempt_enable(); if (ret) msr->data = old_msr_data; @@ -1090,7 +1086,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) return false; } - i = __vmx_find_uret_msr(vmx, MSR_EFER); + i = __vmx_find_uret_msr(MSR_EFER); if (i < 0) return false; @@ -1252,11 +1248,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ if (!vmx->guest_uret_msrs_loaded) { vmx->guest_uret_msrs_loaded = true; - for (i = 0; i < vmx->nr_active_uret_msrs; ++i) - kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot, + for (i = 0; i < vmx_nr_uret_msrs; ++i) { + if (!vmx->guest_uret_msrs[i].load_into_hardware) + continue; + + kvm_set_user_return_msr(i, vmx->guest_uret_msrs[i].data, vmx->guest_uret_msrs[i].mask); - + } } if (vmx->nested.need_vmcs12_to_shadow_sync) @@ -1763,19 +1762,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) +static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, + bool load_into_hardware) { - struct vmx_uret_msr tmp; - int from, to; + struct vmx_uret_msr *uret_msr; - from = __vmx_find_uret_msr(vmx, msr); - if (from < 0) + uret_msr = vmx_find_uret_msr(vmx, msr); + if (!uret_msr) return; - to = vmx->nr_active_uret_msrs++; - tmp = vmx->guest_uret_msrs[to]; - vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from]; - vmx->guest_uret_msrs[from] = tmp; + uret_msr->load_into_hardware = load_into_hardware; } /* @@ -1785,30 +1781,36 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) */ static void setup_msrs(struct vcpu_vmx *vmx) { - vmx->guest_uret_msrs_loaded = false; - vmx->nr_active_uret_msrs = 0; #ifdef CONFIG_X86_64 + bool load_syscall_msrs; + /* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set. */ - if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { - vmx_setup_uret_msr(vmx, MSR_STAR); - vmx_setup_uret_msr(vmx, MSR_LSTAR); - vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK); - } + load_syscall_msrs = is_long_mode(&vmx->vcpu) && + (vmx->vcpu.arch.efer & EFER_SCE); + + vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); + vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); + vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); #endif - if (update_transition_efer(vmx)) - vmx_setup_uret_msr(vmx, MSR_EFER); + vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); - if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)) - vmx_setup_uret_msr(vmx, MSR_TSC_AUX); + vmx_setup_uret_msr(vmx, MSR_TSC_AUX, + guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); - vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); + vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, true); if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); + + /* + * The set of MSRs to load may have changed, reload MSRs before the + * next VM-Enter. + */ + vmx->guest_uret_msrs_loaded = false; } static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d71ed8b425c52..16e4e457ba23c 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -36,7 +36,7 @@ struct vmx_msrs { }; struct vmx_uret_msr { - unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */ + bool load_into_hardware; u64 data; u64 mask; }; From 8ea8b8d6f869425e21f34e60bdbe7e47e6c9d6b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:29 -0700 Subject: [PATCH 049/263] KVM: VMX: Use common x86's uret MSR list as the one true list Drop VMX's global list of user return MSRs now that VMX doesn't resort said list to isolate "active" MSRs, i.e. now that VMX's list and x86's list have the same MSRs in the same order. In addition to eliminating the redundant list, this will also allow moving more of the list management into common x86. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 97 ++++++++++++++------------------- arch/x86/kvm/x86.c | 12 ++++ 3 files changed, 53 insertions(+), 57 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e8dcbc632cf8e..6b15f27f49d01 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1777,6 +1777,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long icr, int op_64_bit); void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_find_user_return_msr(u32 msr); int kvm_probe_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index adefedac0e3b9..39506704be966 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -454,26 +454,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; -/* - * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm - * will emulate SYSCALL in legacy mode if the vendor string in guest - * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To - * support this emulation, MSR_STAR is included in the list for i386, - * but is never loaded into hardware. MSR_CSTAR is also never loaded - * into hardware and is here purely for emulation purposes. - */ -static u32 vmx_uret_msrs_list[] = { -#ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, -#endif - MSR_EFER, MSR_TSC_AUX, MSR_STAR, - MSR_IA32_TSX_CTRL, -}; - -/* - * Number of user return MSRs that are actually supported in hardware. - * vmx_uret_msrs_list is modified when KVM is loaded to drop unsupported MSRs. - */ +/* Number of user return MSRs that are actually supported in hardware. */ static int vmx_nr_uret_msrs; #if IS_ENABLED(CONFIG_HYPERV) @@ -703,22 +684,11 @@ static bool is_valid_passthrough_msr(u32 msr) return r; } -static inline int __vmx_find_uret_msr(u32 msr) -{ - int i; - - for (i = 0; i < vmx_nr_uret_msrs; ++i) { - if (vmx_uret_msrs_list[i] == msr) - return i; - } - return -1; -} - struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __vmx_find_uret_msr(msr); + i = kvm_find_user_return_msr(msr); if (i >= 0) return &vmx->guest_uret_msrs[i]; return NULL; @@ -1086,7 +1056,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) return false; } - i = __vmx_find_uret_msr(MSR_EFER); + i = kvm_find_user_return_msr(MSR_EFER); if (i < 0) return false; @@ -6922,6 +6892,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { + struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; int i, cpu, err; @@ -6946,29 +6917,25 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < vmx_nr_uret_msrs; ++i) { vmx->guest_uret_msrs[i].data = 0; - - switch (vmx_uret_msrs_list[i]) { - case MSR_IA32_TSX_CTRL: - /* - * TSX_CTRL_CPUID_CLEAR is handled in the CPUID - * interception. Keep the host value unchanged to avoid - * changing CPUID bits under the host kernel's feet. - * - * hle=0, rtm=0, tsx_ctrl=1 can be found with some - * combinations of new kernel and old userspace. If - * those guests run on a tsx=off host, do allow guests - * to use TSX_CTRL, but do not change the value on the - * host so that TSX remains always disabled. - */ - if (boot_cpu_has(X86_FEATURE_RTM)) - vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; - else - vmx->guest_uret_msrs[i].mask = 0; - break; - default: - vmx->guest_uret_msrs[i].mask = -1ull; - break; - } + vmx->guest_uret_msrs[i].mask = -1ull; + } + tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); + if (tsx_ctrl) { + /* + * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. + * Keep the host value unchanged to avoid changing CPUID bits + * under the host kernel's feet. + * + * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations + * of new kernel and old userspace. If those guests run on a + * tsx=off host, do allow guests to use TSX_CTRL, but do not + * change the value on the host so that TSX remains always + * disabled. + */ + if (boot_cpu_has(X86_FEATURE_RTM)) + vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + else + vmx->guest_uret_msrs[i].mask = 0; } err = alloc_loaded_vmcs(&vmx->vmcs01); @@ -7829,6 +7796,22 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { static __init void vmx_setup_user_return_msrs(void) { + + /* + * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm + * will emulate SYSCALL in legacy mode if the vendor string in guest + * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To + * support this emulation, MSR_STAR is included in the list for i386, + * but is never loaded into hardware. MSR_CSTAR is also never loaded + * into hardware and is here purely for emulation purposes. + */ + const u32 vmx_uret_msrs_list[] = { + #ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, + #endif + MSR_EFER, MSR_TSC_AUX, MSR_STAR, + MSR_IA32_TSX_CTRL, + }; u32 msr; int i; @@ -7841,7 +7824,7 @@ static __init void vmx_setup_user_return_msrs(void) continue; kvm_define_user_return_msr(vmx_nr_uret_msrs, msr); - vmx_uret_msrs_list[vmx_nr_uret_msrs++] = msr; + vmx_nr_uret_msrs++; } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0856636efc44c..d514031ed25f8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -364,6 +364,18 @@ void kvm_define_user_return_msr(unsigned slot, u32 msr) } EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); +int kvm_find_user_return_msr(u32 msr) +{ + int i; + + for (i = 0; i < user_return_msrs_global.nr; ++i) { + if (user_return_msrs_global.msrs[i] == msr) + return i; + } + return -1; +} +EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); + static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); From 5e17c624010a82bbcca9b955155781927eb6532a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:30 -0700 Subject: [PATCH 050/263] KVM: VMX: Disable loading of TSX_CTRL MSR the more conventional way Tag TSX_CTRL as not needing to be loaded when RTM isn't supported in the host. Crushing the write mask to '0' has the same effect, but requires more mental gymnastics to understand. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 39506704be966..bd2187b4cc530 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1771,7 +1771,13 @@ static void setup_msrs(struct vcpu_vmx *vmx) guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); - vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, true); + /* + * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new + * kernel and old userspace. If those guests run on a tsx=off host, do + * allow guests to use TSX_CTRL, but don't change the value in hardware + * so that TSX remains always disabled. + */ + vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); @@ -6919,23 +6925,15 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->guest_uret_msrs[i].data = 0; vmx->guest_uret_msrs[i].mask = -1ull; } - tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); - if (tsx_ctrl) { + if (boot_cpu_has(X86_FEATURE_RTM)) { /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. * Keep the host value unchanged to avoid changing CPUID bits * under the host kernel's feet. - * - * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations - * of new kernel and old userspace. If those guests run on a - * tsx=off host, do allow guests to use TSX_CTRL, but do not - * change the value on the host so that TSX remains always - * disabled. */ - if (boot_cpu_has(X86_FEATURE_RTM)) + tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); + if (tsx_ctrl) vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; - else - vmx->guest_uret_msrs[i].mask = 0; } err = alloc_loaded_vmcs(&vmx->vmcs01); From 9cc39a5a43c05f8eda206bf9e144119820ecf5c8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:31 -0700 Subject: [PATCH 051/263] KVM: x86: Export the number of uret MSRs to vendor modules Split out and export the number of configured user return MSRs so that VMX can iterate over the set of MSRs without having to do its own tracking. Keep the list itself internal to x86 so that vendor code still has to go through the "official" APIs to add/modify entries. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 29 +++++++++++++---------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b15f27f49d01..22505e74c3dac 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1418,6 +1418,7 @@ struct kvm_arch_async_pf { bool direct_map; }; +extern u32 __read_mostly kvm_nr_uret_msrs; extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d514031ed25f8..5e1deed8ea5d2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -184,11 +184,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); */ #define KVM_MAX_NR_USER_RETURN_MSRS 16 -struct kvm_user_return_msrs_global { - int nr; - u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; -}; - struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; @@ -198,7 +193,9 @@ struct kvm_user_return_msrs { } values[KVM_MAX_NR_USER_RETURN_MSRS]; }; -static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; +u32 __read_mostly kvm_nr_uret_msrs; +EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); +static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ @@ -330,10 +327,10 @@ static void kvm_on_user_return(struct user_return_notifier *urn) user_return_notifier_unregister(urn); } local_irq_restore(flags); - for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { + for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(user_return_msrs_global.msrs[slot], values->host); + wrmsrl(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } @@ -358,9 +355,9 @@ EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr); void kvm_define_user_return_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); - user_return_msrs_global.msrs[slot] = msr; - if (slot >= user_return_msrs_global.nr) - user_return_msrs_global.nr = slot + 1; + kvm_uret_msrs_list[slot] = msr; + if (slot >= kvm_nr_uret_msrs) + kvm_nr_uret_msrs = slot + 1; } EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); @@ -368,8 +365,8 @@ int kvm_find_user_return_msr(u32 msr) { int i; - for (i = 0; i < user_return_msrs_global.nr; ++i) { - if (user_return_msrs_global.msrs[i] == msr) + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + if (kvm_uret_msrs_list[i] == msr) return i; } return -1; @@ -383,8 +380,8 @@ static void kvm_user_return_msr_cpu_online(void) u64 value; int i; - for (i = 0; i < user_return_msrs_global.nr; ++i) { - rdmsrl_safe(user_return_msrs_global.msrs[i], &value); + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + rdmsrl_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } @@ -399,7 +396,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); + err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; From e5fda4bbadb053e3b5164476146cf43092785c0b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:32 -0700 Subject: [PATCH 052/263] KVM: x86: Move uret MSR slot management to common x86 Now that SVM and VMX both probe MSRs before "defining" user return slots for them, consolidate the code for probe+define into common x86 and eliminate the odd behavior of having the vendor code define the slot for a given MSR. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm/svm.c | 5 +---- arch/x86/kvm/vmx/vmx.c | 19 ++++--------------- arch/x86/kvm/x86.c | 19 +++++++++++-------- 4 files changed, 17 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 22505e74c3dac..8155eaac22a7e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1777,9 +1777,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); -void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); -int kvm_probe_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0ba0a00f8dc6a..d13b72bfb736b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -959,10 +959,7 @@ static __init int svm_hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 32; } - if (!kvm_probe_user_return_msr(MSR_TSC_AUX)) { - tsc_aux_uret_slot = 0; - kvm_define_user_return_msr(tsc_aux_uret_slot, MSR_TSC_AUX); - } + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); /* Check for pause filtering support */ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bd2187b4cc530..042823b492730 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -454,9 +454,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; -/* Number of user return MSRs that are actually supported in hardware. */ -static int vmx_nr_uret_msrs; - #if IS_ENABLED(CONFIG_HYPERV) static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -1218,7 +1215,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ if (!vmx->guest_uret_msrs_loaded) { vmx->guest_uret_msrs_loaded = true; - for (i = 0; i < vmx_nr_uret_msrs; ++i) { + for (i = 0; i < kvm_nr_uret_msrs; ++i) { if (!vmx->guest_uret_msrs[i].load_into_hardware) continue; @@ -6921,7 +6918,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - for (i = 0; i < vmx_nr_uret_msrs; ++i) { + for (i = 0; i < kvm_nr_uret_msrs; ++i) { vmx->guest_uret_msrs[i].data = 0; vmx->guest_uret_msrs[i].mask = -1ull; } @@ -7810,20 +7807,12 @@ static __init void vmx_setup_user_return_msrs(void) MSR_EFER, MSR_TSC_AUX, MSR_STAR, MSR_IA32_TSX_CTRL, }; - u32 msr; int i; BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); - for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { - msr = vmx_uret_msrs_list[i]; - - if (kvm_probe_user_return_msr(msr)) - continue; - - kvm_define_user_return_msr(vmx_nr_uret_msrs, msr); - vmx_nr_uret_msrs++; - } + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) + kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } static __init int hardware_setup(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5e1deed8ea5d2..0a0eebf35dd53 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -336,7 +336,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) } } -int kvm_probe_user_return_msr(u32 msr) +static int kvm_probe_user_return_msr(u32 msr) { u64 val; int ret; @@ -350,16 +350,18 @@ int kvm_probe_user_return_msr(u32 msr) preempt_enable(); return ret; } -EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr); -void kvm_define_user_return_msr(unsigned slot, u32 msr) +int kvm_add_user_return_msr(u32 msr) { - BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); - kvm_uret_msrs_list[slot] = msr; - if (slot >= kvm_nr_uret_msrs) - kvm_nr_uret_msrs = slot + 1; + BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS); + + if (kvm_probe_user_return_msr(msr)) + return -1; + + kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; + return kvm_nr_uret_msrs++; } -EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); +EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); int kvm_find_user_return_msr(u32 msr) { @@ -8132,6 +8134,7 @@ int kvm_arch_init(void *opaque) printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); goto out_free_x86_emulator_cache; } + kvm_nr_uret_msrs = 0; r = kvm_mmu_module_init(); if (r) From 61a05d444d2ca8d40add453a5f7058fbb1b57eca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:33 -0700 Subject: [PATCH 053/263] KVM: x86: Tie Intel and AMD behavior for MSR_TSC_AUX to guest CPU model Squish the Intel and AMD emulation of MSR_TSC_AUX together and tie it to the guest CPU model instead of the host CPU behavior. While not strictly necessary to avoid guest breakage, emulating cross-vendor "architecture" will provide consistent behavior for the guest, e.g. WRMSR fault behavior won't change if the vCPU is migrated to a host with divergent behavior. Note, the "new" kvm_is_supported_user_return_msr() checks do not add new functionality on either SVM or VMX. On SVM, the equivalent was "tsc_aux_uret_slot < 0", and on VMX the check was buried in the vmx_find_uret_msr() call at the find_uret_msr label. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/svm/svm.c | 24 ---------------------- arch/x86/kvm/vmx/vmx.c | 15 -------------- arch/x86/kvm/x86.c | 36 +++++++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8155eaac22a7e..f85480b1d215f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1781,6 +1781,11 @@ int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); +static inline bool kvm_is_supported_user_return_msr(u32 msr) +{ + return kvm_find_user_return_msr(msr) >= 0; +} + u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d13b72bfb736b..0922d8e173e61 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2663,12 +2663,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; case MSR_TSC_AUX: - if (tsc_aux_uret_slot < 0) - return 1; - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) - return 1; msr_info->data = svm->tsc_aux; break; /* @@ -2885,24 +2879,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: - if (tsc_aux_uret_slot < 0) - return 1; - - if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) - return 1; - - /* - * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has - * incomplete and conflicting architectural behavior. Current - * AMD CPUs completely ignore bits 63:32, i.e. they aren't - * reserved and always read as zeros. Emulate AMD CPU behavior - * to avoid explosions if the vCPU is migrated from an AMD host - * to an Intel host. - */ - data = (u32)data; - /* * TSC_AUX is usually changed only during boot and never read * directly. Intercept TSC_AUX instead of exposing it to the diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 042823b492730..61f7e5221bf3a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1981,12 +1981,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; - case MSR_TSC_AUX: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) - return 1; - goto find_uret_msr; case MSR_IA32_DEBUGCTLMSR: msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); break; @@ -2302,15 +2296,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else vmx->pt_desc.guest.addr_a[index / 2] = data; break; - case MSR_TSC_AUX: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) - return 1; - /* Check reserved bit, higher 32 bits should be zero */ - if ((data >> 32) != 0) - return 1; - goto find_uret_msr; case MSR_IA32_PERF_CAPABILITIES: if (data && !vcpu_to_pmu(vcpu)->version) return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a0eebf35dd53..4efd2978ec089 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1642,6 +1642,30 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * invokes 64-bit SYSENTER. */ data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); + break; + case MSR_TSC_AUX: + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) + return 1; + + if (!host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + return 1; + + /* + * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has + * incomplete and conflicting architectural behavior. Current + * AMD CPUs completely ignore bits 63:32, i.e. they aren't + * reserved and always read as zeros. Enforce Intel's reserved + * bits check if and only if the guest CPU is Intel, and clear + * the bits in all other cases. This ensures cross-vendor + * migration will provide consistent behavior for the guest. + */ + if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) + return 1; + + data = (u32)data; + break; } msr.data = data; @@ -1678,6 +1702,18 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) return KVM_MSR_RET_FILTERED; + switch (index) { + case MSR_TSC_AUX: + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) + return 1; + + if (!host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + return 1; + break; + } + msr.index = index; msr.host_initiated = host_initiated; From 78bba966ee3cdbbfc585d8e39237378fba50a142 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:34 -0700 Subject: [PATCH 054/263] KVM: x86: Hide RDTSCP and RDPID if MSR_TSC_AUX probing failed If probing MSR_TSC_AUX failed, hide RDTSCP and RDPID, and WARN if either feature was reported as supported. In theory, such a scenario should never happen as both Intel and AMD state that MSR_TSC_AUX is available if RDTSCP or RDPID is supported. But, KVM injects #GP on MSR_TSC_AUX accesses if probing failed, faults on WRMSR(MSR_TSC_AUX) may be fatal to the guest (because they happen during early CPU bringup), and KVM itself has effectively misreported RDPID support in the past. Note, this also has the happy side effect of omitting MSR_TSC_AUX from the list of MSRs that are exposed to userspace if probing the MSR fails. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-16-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c0e8c5e921898..f42e9491a6c8c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -567,6 +567,21 @@ void kvm_set_cpu_caps(void) F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN) ); + + /* + * Hide RDTSCP and RDPID if either feature is reported as supported but + * probing MSR_TSC_AUX failed. This is purely a sanity check and + * should never happen, but the guest will likely crash if RDTSCP or + * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in + * the past. For example, the sanity check may fire if this instance of + * KVM is running as L1 on top of an older, broken KVM. + */ + if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) || + kvm_cpu_cap_has(X86_FEATURE_RDPID)) && + !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) { + kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); + kvm_cpu_cap_clear(X86_FEATURE_RDPID); + } } EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); From 34114136f725cbd0c83e7b5a0c8a977976cd82f7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 5 May 2021 22:15:09 +1000 Subject: [PATCH 055/263] KVM: PPC: Book3S HV: Fix conversion to gfn-based MMU notifier callbacks Commit b1c5356e873c ("KVM: PPC: Convert to the gfn-based MMU notifier callbacks") causes unmap_gfn_range and age_gfn callbacks to only work on the first gfn in the range. It also makes the aging callbacks call into both radix and hash aging functions for radix guests. Fix this. Add warnings for the single-gfn calls that have been converted to range callbacks, in case they ever receieve ranges greater than 1. Fixes: b1c5356e873c ("KVM: PPC: Convert to the gfn-based MMU notifier callbacks") Reported-by: Bharata B Rao Tested-by: Bharata B Rao Signed-off-by: Nicholas Piggin Message-Id: <20210505121509.1470207-1-npiggin@gmail.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/include/asm/kvm_book3s.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 46 ++++++++++++++++++-------- arch/powerpc/kvm/book3s_64_mmu_radix.c | 5 ++- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a6e9a5585e618..e6b53c6e21e32 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -210,7 +210,7 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); -extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, +extern void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b7bd9ca040b85..2d9193cd73be4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -795,7 +795,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, } } -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, +static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { unsigned long i; @@ -829,15 +829,21 @@ static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } - return false; } bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - return kvm_unmap_radix(kvm, range->slot, range->start); + gfn_t gfn; + + if (kvm_is_radix(kvm)) { + for (gfn = range->start; gfn < range->end; gfn++) + kvm_unmap_radix(kvm, range->slot, gfn); + } else { + for (gfn = range->start; gfn < range->end; gfn++) + kvm_unmap_rmapp(kvm, range->slot, range->start); + } - return kvm_unmap_rmapp(kvm, range->slot, range->start); + return false; } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, @@ -924,10 +930,18 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - kvm_age_radix(kvm, range->slot, range->start); + gfn_t gfn; + bool ret = false; - return kvm_age_rmapp(kvm, range->slot, range->start); + if (kvm_is_radix(kvm)) { + for (gfn = range->start; gfn < range->end; gfn++) + ret |= kvm_age_radix(kvm, range->slot, gfn); + } else { + for (gfn = range->start; gfn < range->end; gfn++) + ret |= kvm_age_rmapp(kvm, range->slot, gfn); + } + + return ret; } static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -965,18 +979,24 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - kvm_test_age_radix(kvm, range->slot, range->start); + WARN_ON(range->start + 1 != range->end); - return kvm_test_age_rmapp(kvm, range->slot, range->start); + if (kvm_is_radix(kvm)) + return kvm_test_age_radix(kvm, range->slot, range->start); + else + return kvm_test_age_rmapp(kvm, range->slot, range->start); } bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { + WARN_ON(range->start + 1 != range->end); + if (kvm_is_radix(kvm)) - return kvm_unmap_radix(kvm, range->slot, range->start); + kvm_unmap_radix(kvm, range->slot, range->start); + else + kvm_unmap_rmapp(kvm, range->slot, range->start); - return kvm_unmap_rmapp(kvm, range->slot, range->start); + return false; } static int vcpus_running(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index ec4f58fa9f5a2..d909c069363e0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -993,7 +993,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, } /* Called with kvm->mmu_lock held */ -bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, +void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { pte_t *ptep; @@ -1002,14 +1002,13 @@ bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) { uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT); - return false; + return; } ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); - return false; } /* Called with kvm->mmu_lock held */ From e8ea85fb280ec55674bca88ea7cd85f60d19567f Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Tue, 2 Feb 2021 17:04:32 +0800 Subject: [PATCH 056/263] KVM: X86: Add support for the emulation of DR6_BUS_LOCK bit Bus lock debug exception introduces a new bit DR6_BUS_LOCK (bit 11 of DR6) to indicate that bus lock #DB exception is generated. The set/clear of DR6_BUS_LOCK is similar to the DR6_RTM. The processor clears DR6_BUS_LOCK when the exception is generated. For all other #DB, the processor sets this bit to 1. Software #DB handler should set this bit before returning to the interrupted task. In VMM, to avoid breaking the CPUs without bus lock #DB exception support, activate the DR6_BUS_LOCK conditionally in DR6_FIXED_1 bits. When intercepting the #DB exception caused by bus locks, bit 11 of the exit qualification is set to identify it. The VMM should emulate the exception by clearing the bit 11 of the guest DR6. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Chenyi Qiang Message-Id: <20210202090433.13441-3-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/x86.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f85480b1d215f..8836b30962178 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -200,6 +200,7 @@ enum x86_intercept_stage; #define KVM_NR_DB_REGS 4 +#define DR6_BUS_LOCK (1 << 11) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) #define DR6_BT (1 << 15) @@ -213,7 +214,7 @@ enum x86_intercept_stage; * DR6_ACTIVE_LOW is also used as the init/reset value for DR6. */ #define DR6_ACTIVE_LOW 0xffff0ff0 -#define DR6_VOLATILE 0x0001e00f +#define DR6_VOLATILE 0x0001e80f #define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) #define DR7_BP_EN_MASK 0x000000ff diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4efd2978ec089..9b1a7040eae31 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1176,6 +1176,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + fixed |= DR6_BUS_LOCK; return fixed; } From 76ea438b4afcd9ee8da3387e9af4625eaccff58f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 6 May 2021 06:30:04 -0400 Subject: [PATCH 057/263] KVM: X86: Expose bus lock debug exception to guest Bus lock debug exception is an ability to notify the kernel by an #DB trap after the instruction acquires a bus lock and is executed when CPL>0. This allows the kernel to enforce user application throttling or mitigations. Existence of bus lock debug exception is enumerated via CPUID.(EAX=7,ECX=0).ECX[24]. Software can enable these exceptions by setting bit 2 of the MSR_IA32_DEBUGCTL. Expose the CPUID to guest and emulate the MSR handling when guest enables it. Support for this feature was originally developed by Xiaoyao Li and Chenyi Qiang, but code has since changed enough that this patch has nothing in common with theirs, except for this commit message. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Chenyi Qiang Message-Id: <20210202090433.13441-4-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/vmx/capabilities.h | 3 +++ arch/x86/kvm/vmx/vmx.c | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f42e9491a6c8c..9a48f138832d4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -458,7 +458,7 @@ void kvm_set_cpu_caps(void) F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | - F(SGX_LC) + F(SGX_LC) | F(BUS_LOCK_DETECT) ); /* Set LA57 based on hardware capability. */ if (cpuid_ecx(7) & F(LA57)) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index d1d77985e889f..8dee8a5fbc17f 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -398,6 +398,9 @@ static inline u64 vmx_supported_debugctl(void) { u64 debugctl = 0; + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; + if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) debugctl |= DEBUGCTLMSR_LBR_MASK; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 61f7e5221bf3a..f2fd447eed459 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2014,6 +2014,9 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) if (!intel_pmu_lbr_is_enabled(vcpu)) debugctl &= ~DEBUGCTLMSR_LBR_MASK; + if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + return debugctl; } From 03ca4589fabcc66b27e4cb8f8e95d64cf43badd0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 May 2021 13:42:21 -0700 Subject: [PATCH 058/263] KVM: x86: Prevent KVM SVM from loading on kernels with 5-level paging Disallow loading KVM SVM if 5-level paging is supported. In theory, NPT for L1 should simply work, but there unknowns with respect to how the guest's MAXPHYADDR will be handled by hardware. Nested NPT is more problematic, as running an L1 VMM that is using 2-level page tables requires stacking single-entry PDP and PML4 tables in KVM's NPT for L2, as there are no equivalent entries in L1's NPT to shadow. Barring hardware magic, for 5-level paging, KVM would need stack another layer to handle PML5. Opportunistically rename the lm_root pointer, which is used for the aforementioned stacking when shadowing 2-level L1 NPT, to pml4_root to call out that it's specifically for PML4. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20210505204221.1934471-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++---------- arch/x86/kvm/svm/svm.c | 5 +++++ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8836b30962178..55efbacfc2445 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -409,7 +409,7 @@ struct kvm_mmu { u32 pkru_mask; u64 *pae_root; - u64 *lm_root; + u64 *pml4_root; /* * check zero bits on shadow page table entries, these diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4b3ee244ebe05..0144c40d09c76 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3310,12 +3310,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - if (WARN_ON_ONCE(!mmu->lm_root)) { + if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } - mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask; + mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; } for (i = 0; i < 4; ++i) { @@ -3335,7 +3335,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) - mmu->root_hpa = __pa(mmu->lm_root); + mmu->root_hpa = __pa(mmu->pml4_root); else mmu->root_hpa = __pa(mmu->pae_root); @@ -3350,7 +3350,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 *lm_root, *pae_root; + u64 *pml4_root, *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP @@ -3369,14 +3369,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) return -EIO; - if (mmu->pae_root && mmu->lm_root) + if (mmu->pae_root && mmu->pml4_root) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. */ - if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root)) + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) return -EIO; /* @@ -3387,14 +3387,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (!pae_root) return -ENOMEM; - lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!lm_root) { + pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml4_root) { free_page((unsigned long)pae_root); return -ENOMEM; } mmu->pae_root = pae_root; - mmu->lm_root = lm_root; + mmu->pml4_root = pml4_root; return 0; } @@ -5261,7 +5261,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu) if (!tdp_enabled && mmu->pae_root) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); - free_page((unsigned long)mmu->lm_root); + free_page((unsigned long)mmu->pml4_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0922d8e173e61..8124f51e9488f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -447,6 +447,11 @@ static int has_svm(void) return 0; } + if (pgtable_l5_enabled()) { + pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); + return 0; + } + return 1; } From 594b27e677b35f9734b1969d175ebc6146741109 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 May 2021 23:48:17 +0200 Subject: [PATCH 059/263] KVM: x86: Cancel pvclock_gtod_work on module removal Nothing prevents the following: pvclock_gtod_notify() queue_work(system_long_wq, &pvclock_gtod_work); ... remove_module(kvm); ... work_queue_run() pvclock_gtod_work() <- UAF Ditto for any other operation on that workqueue list head which touches pvclock_gtod_work after module removal. Cancel the work in kvm_arch_exit() to prevent that. Fixes: 16e8d74d2da9 ("KVM: x86: notifier for clocksource changes") Signed-off-by: Thomas Gleixner Message-Id: <87czu4onry.ffs@nanos.tec.linutronix.de> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b1a7040eae31..259139a145cbb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8224,6 +8224,7 @@ void kvm_arch_exit(void) cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); + cancel_work_sync(&pvclock_gtod_work); #endif kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); From 3f804f6d201ca93adf4c3df04d1bfd152c1129d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 6 May 2021 15:21:37 +0200 Subject: [PATCH 060/263] KVM: x86: Prevent deadlock against tk_core.seq syzbot reported a possible deadlock in pvclock_gtod_notify(): CPU 0 CPU 1 write_seqcount_begin(&tk_core.seq); pvclock_gtod_notify() spin_lock(&pool->lock); queue_work(..., &pvclock_gtod_work) ktime_get() spin_lock(&pool->lock); do { seq = read_seqcount_begin(tk_core.seq) ... } while (read_seqcount_retry(&tk_core.seq, seq); While this is unlikely to happen, it's possible. Delegate queue_work() to irq_work() which postpones it until the tk_core.seq write held region is left and interrupts are reenabled. Fixes: 16e8d74d2da9 ("KVM: x86: notifier for clocksource changes") Reported-by: syzbot+6beae4000559d41d80f8@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Message-Id: <87h7jgm1zy.ffs@nanos.tec.linutronix.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 259139a145cbb..5bd550eaf6833 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8094,6 +8094,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work) static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); +/* + * Indirection to move queue_work() out of the tk_core.seq write held + * region to prevent possible deadlocks against time accessors which + * are invoked with work related locks held. + */ +static void pvclock_irq_work_fn(struct irq_work *w) +{ + queue_work(system_long_wq, &pvclock_gtod_work); +} + +static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn); + /* * Notification about pvclock gtod data update. */ @@ -8105,13 +8117,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, update_pvclock_gtod(tk); - /* disable master clock if host does not trust, or does not - * use, TSC based clocksource. + /* + * Disable master clock if host does not trust, or does not use, + * TSC based clocksource. Delegate queue_work() to irq_work as + * this is invoked with tk_core.seq write held. */ if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && atomic_read(&kvm_guest_has_master_clock) != 0) - queue_work(system_long_wq, &pvclock_gtod_work); - + irq_work_queue(&pvclock_irq_work); return 0; } @@ -8224,6 +8237,7 @@ void kvm_arch_exit(void) cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); + irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif kvm_x86_ops.hardware_enable = NULL; From b26990987ffce0525abbd84b36595869cfdbbfe6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 6 May 2021 16:03:52 +0200 Subject: [PATCH 061/263] tools/kvm_stat: Fix documentation typo Makes the dash in front of option '-z' disappear in the generated man-page. Signed-off-by: Stefan Raspl Message-Id: <20210506140352.4178789-1-raspl@linux.ibm.com> Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index feaf46451e838..3a9f2037bd23f 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -111,7 +111,7 @@ OPTIONS --tracepoints:: retrieve statistics from tracepoints -*z*:: +-z:: --skip-zero-records:: omit records with all zeros in logging mode From 258785ef08b323bddd844b4926a32c2b2045a1b0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 6 May 2021 15:24:43 +0000 Subject: [PATCH 062/263] kvm: Cap halt polling at kvm->max_halt_poll_ns When growing halt-polling, there is no check that the poll time exceeds the per-VM limit. It's possible for vcpu->halt_poll_ns to grow past kvm->max_halt_poll_ns and stay there until a halt which takes longer than kvm->halt_poll_ns. Signed-off-by: David Matlack Signed-off-by: Venkatesh Srinivas Message-Id: <20210506152442.4010298-1-venkateshs@chromium.org> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b9f12da6af0ea..6b4feb92dc797 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2893,8 +2893,8 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) if (val < grow_start) val = grow_start; - if (val > halt_poll_ns) - val = halt_poll_ns; + if (val > vcpu->kvm->max_halt_poll_ns) + val = vcpu->kvm->max_halt_poll_ns; vcpu->halt_poll_ns = val; out: From 368340a3c7d9a207bfe544721d464b7109be8eae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 May 2021 16:15:42 -0700 Subject: [PATCH 063/263] KVM: SVM: Invert user pointer casting in SEV {en,de}crypt helpers Invert the user pointer params for SEV's helpers for encrypting and decrypting guest memory so that they take a pointer and cast to an unsigned long as necessary, as opposed to doing the opposite. Tagging a non-pointer as __user is confusing and weird since a cast of some form needs to occur to actually access the user data. This also fixes Sparse warnings triggered by directly consuming the unsigned longs, which are "noderef" due to the __user tag. Cc: Brijesh Singh Cc: Tom Lendacky Cc: Ashish Kalra Signed-off-by: Sean Christopherson Message-Id: <20210506231542.2331138-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8b11c711a0e40..eb241c1a4add3 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -763,7 +763,7 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, } static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, - unsigned long __user dst_uaddr, + void __user *dst_uaddr, unsigned long dst_paddr, int size, int *err) { @@ -787,8 +787,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, if (tpage) { offset = paddr & 15; - if (copy_to_user((void __user *)(uintptr_t)dst_uaddr, - page_address(tpage) + offset, size)) + if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size)) ret = -EFAULT; } @@ -800,9 +799,9 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, } static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, - unsigned long __user vaddr, + void __user *vaddr, unsigned long dst_paddr, - unsigned long __user dst_vaddr, + void __user *dst_vaddr, int size, int *error) { struct page *src_tpage = NULL; @@ -810,13 +809,12 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, int ret, len = size; /* If source buffer is not aligned then use an intermediate buffer */ - if (!IS_ALIGNED(vaddr, 16)) { + if (!IS_ALIGNED((unsigned long)vaddr, 16)) { src_tpage = alloc_page(GFP_KERNEL); if (!src_tpage) return -ENOMEM; - if (copy_from_user(page_address(src_tpage), - (void __user *)(uintptr_t)vaddr, size)) { + if (copy_from_user(page_address(src_tpage), vaddr, size)) { __free_page(src_tpage); return -EFAULT; } @@ -830,7 +828,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, * - copy the source buffer in an intermediate buffer * - use the intermediate buffer as source buffer */ - if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { + if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { int dst_offset; dst_tpage = alloc_page(GFP_KERNEL); @@ -855,7 +853,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, page_address(src_tpage), size); else { if (copy_from_user(page_address(dst_tpage) + dst_offset, - (void __user *)(uintptr_t)vaddr, size)) { + vaddr, size)) { ret = -EFAULT; goto e_free; } @@ -935,15 +933,15 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (dec) ret = __sev_dbg_decrypt_user(kvm, __sme_page_pa(src_p[0]) + s_off, - dst_vaddr, + (void __user *)dst_vaddr, __sme_page_pa(dst_p[0]) + d_off, len, &argp->error); else ret = __sev_dbg_encrypt_user(kvm, __sme_page_pa(src_p[0]) + s_off, - vaddr, + (void __user *)vaddr, __sme_page_pa(dst_p[0]) + d_off, - dst_vaddr, + (void __user *)dst_vaddr, len, &argp->error); sev_unpin_memory(kvm, src_p, n); From ce7ea0cfdc2e9ff31d12da31c3226deddb9644f5 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 6 May 2021 15:14:41 -0500 Subject: [PATCH 064/263] KVM: SVM: Move GHCB unmapping to fix RCU warning When an SEV-ES guest is running, the GHCB is unmapped as part of the vCPU run support. However, kvm_vcpu_unmap() triggers an RCU dereference warning with CONFIG_PROVE_LOCKING=y because the SRCU lock is released before invoking the vCPU run support. Move the GHCB unmapping into the prepare_guest_switch callback, which is invoked while still holding the SRCU lock, eliminating the RCU dereference warning. Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Reported-by: Borislav Petkov Signed-off-by: Tom Lendacky Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 5 +---- arch/x86/kvm/svm/svm.c | 3 +++ arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index eb241c1a4add3..5bc887e9a9860 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2197,7 +2197,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return -EINVAL; } -static void pre_sev_es_run(struct vcpu_svm *svm) +void sev_es_unmap_ghcb(struct vcpu_svm *svm) { if (!svm->ghcb) return; @@ -2233,9 +2233,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) struct svm_cpu_data *sd = per_cpu(svm_data, cpu); int asid = sev_get_asid(svm->vcpu.kvm); - /* Perform any SEV-ES pre-run actions */ - pre_sev_es_run(svm); - /* Assign the asid allocated with this SEV guest */ svm->asid = asid; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8124f51e9488f..4dd9b7856e5b1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1437,6 +1437,9 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + if (sev_es_guest(vcpu->kvm)) + sev_es_unmap_ghcb(svm); + if (svm->guest_state_loaded) return; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 84b3133c2251d..e44567ceb8655 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -581,6 +581,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_create_vcpu(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); +void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ From 698ab77aebffe08b312fbcdddeb0e8bd08b78717 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 28 Apr 2021 15:03:12 -0400 Subject: [PATCH 065/263] dax: Add an enum for specifying dax wakup mode Dan mentioned that he is not very fond of passing around a boolean true/false to specify if only next waiter should be woken up or all waiters should be woken up. He instead prefers that we introduce an enum and make it very explicity at the callsite itself. Easier to read code. This patch should not introduce any change of behavior. Reviewed-by: Greg Kurz Reviewed-by: Jan Kara Suggested-by: Dan Williams Signed-off-by: Vivek Goyal Link: https://lore.kernel.org/r/20210428190314.1865312-2-vgoyal@redhat.com Signed-off-by: Dan Williams --- fs/dax.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index b3d27fdc67752..5ecee51c44ee7 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -144,6 +144,16 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; +/** + * enum dax_wake_mode: waitqueue wakeup behaviour + * @WAKE_ALL: wake all waiters in the waitqueue + * @WAKE_NEXT: wake only the first waiter in the waitqueue + */ +enum dax_wake_mode { + WAKE_ALL, + WAKE_NEXT, +}; + static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry, struct exceptional_entry_key *key) { @@ -182,7 +192,8 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) +static void dax_wake_entry(struct xa_state *xas, void *entry, + enum dax_wake_mode mode) { struct exceptional_entry_key key; wait_queue_head_t *wq; @@ -196,7 +207,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) * must be in the waitqueue and the following check will see them. */ if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); + __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); } /* @@ -268,7 +279,7 @@ static void put_unlocked_entry(struct xa_state *xas, void *entry) { /* If we were the only waiter woken, wake the next one */ if (entry && !dax_is_conflict(entry)) - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); } /* @@ -286,7 +297,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry) old = xas_store(xas, entry); xas_unlock_irq(xas); BUG_ON(!dax_is_locked(old)); - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); } /* @@ -524,7 +535,7 @@ static void *grab_mapping_entry(struct xa_state *xas, dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ - dax_wake_entry(xas, entry, true); + dax_wake_entry(xas, entry, WAKE_ALL); mapping->nrexceptional--; entry = NULL; xas_set(xas, index); @@ -937,7 +948,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_lock_irq(xas); xas_store(xas, entry); xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); trace_dax_writeback_one(mapping->host, index, count); return ret; From 4c3d043d271d4d629aa2328796cdfc96b37d3b3c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 28 Apr 2021 15:03:13 -0400 Subject: [PATCH 066/263] dax: Add a wakeup mode parameter to put_unlocked_entry() As of now put_unlocked_entry() always wakes up next waiter. In next patches we want to wake up all waiters at one callsite. Hence, add a parameter to the function. This patch does not introduce any change of behavior. Reviewed-by: Greg Kurz Reviewed-by: Jan Kara Suggested-by: Dan Williams Signed-off-by: Vivek Goyal Link: https://lore.kernel.org/r/20210428190314.1865312-3-vgoyal@redhat.com Signed-off-by: Dan Williams --- fs/dax.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5ecee51c44ee7..56eb1c759ca5c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -275,11 +275,11 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry) finish_wait(wq, &ewait.wait); } -static void put_unlocked_entry(struct xa_state *xas, void *entry) +static void put_unlocked_entry(struct xa_state *xas, void *entry, + enum dax_wake_mode mode) { - /* If we were the only waiter woken, wake the next one */ if (entry && !dax_is_conflict(entry)) - dax_wake_entry(xas, entry, WAKE_NEXT); + dax_wake_entry(xas, entry, mode); } /* @@ -633,7 +633,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, entry = get_unlocked_entry(&xas, 0); if (entry) page = dax_busy_page(entry); - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); if (page) break; if (++scanned % XA_CHECK_SCHED) @@ -675,7 +675,7 @@ static int __dax_invalidate_entry(struct address_space *mapping, mapping->nrexceptional--; ret = 1; out: - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); xas_unlock_irq(&xas); return ret; } @@ -954,7 +954,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, return ret; put_unlocked: - put_unlocked_entry(xas, entry); + put_unlocked_entry(xas, entry, WAKE_NEXT); return ret; } @@ -1695,7 +1695,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /* Did we race with someone splitting entry or so? */ if (!entry || dax_is_conflict(entry) || (order == 0 && !dax_is_pte_entry(entry))) { - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); From 237388320deffde7c2d65ed8fc9eef670dc979b3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 28 Apr 2021 15:03:14 -0400 Subject: [PATCH 067/263] dax: Wake up all waiters after invalidating dax entry I am seeing missed wakeups which ultimately lead to a deadlock when I am using virtiofs with DAX enabled and running "make -j". I had to mount virtiofs as rootfs and also reduce to dax window size to 256M to reproduce the problem consistently. So here is the problem. put_unlocked_entry() wakes up waiters only if entry is not null as well as !dax_is_conflict(entry). But if I call multiple instances of invalidate_inode_pages2() in parallel, then I can run into a situation where there are waiters on this index but nobody will wake these waiters. invalidate_inode_pages2() invalidate_inode_pages2_range() invalidate_exceptional_entry2() dax_invalidate_mapping_entry_sync() __dax_invalidate_entry() { xas_lock_irq(&xas); entry = get_unlocked_entry(&xas, 0); ... ... dax_disassociate_entry(entry, mapping, trunc); xas_store(&xas, NULL); ... ... put_unlocked_entry(&xas, entry); xas_unlock_irq(&xas); } Say a fault in in progress and it has locked entry at offset say "0x1c". Now say three instances of invalidate_inode_pages2() are in progress (A, B, C) and they all try to invalidate entry at offset "0x1c". Given dax entry is locked, all tree instances A, B, C will wait in wait queue. When dax fault finishes, say A is woken up. It will store NULL entry at index "0x1c" and wake up B. When B comes along it will find "entry=0" at page offset 0x1c and it will call put_unlocked_entry(&xas, 0). And this means put_unlocked_entry() will not wake up next waiter, given the current code. And that means C continues to wait and is not woken up. This patch fixes the issue by waking up all waiters when a dax entry has been invalidated. This seems to fix the deadlock I am facing and I can make forward progress. Reported-by: Sergio Lopez Fixes: ac401cc78242 ("dax: New fault locking") Reviewed-by: Jan Kara Suggested-by: Dan Williams Signed-off-by: Vivek Goyal Link: https://lore.kernel.org/r/20210428190314.1865312-4-vgoyal@redhat.com Signed-off-by: Dan Williams --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 56eb1c759ca5c..df5485b4bddf1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -675,7 +675,7 @@ static int __dax_invalidate_entry(struct address_space *mapping, mapping->nrexceptional--; ret = 1; out: - put_unlocked_entry(&xas, entry, WAKE_NEXT); + put_unlocked_entry(&xas, entry, WAKE_ALL); xas_unlock_irq(&xas); return ret; } From a298232ee6b9a1d5d732aa497ff8be0d45b5bd82 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 May 2021 21:06:38 +0100 Subject: [PATCH 068/263] io_uring: fix link timeout refs WARNING: CPU: 0 PID: 10242 at lib/refcount.c:28 refcount_warn_saturate+0x15b/0x1a0 lib/refcount.c:28 RIP: 0010:refcount_warn_saturate+0x15b/0x1a0 lib/refcount.c:28 Call Trace: __refcount_sub_and_test include/linux/refcount.h:283 [inline] __refcount_dec_and_test include/linux/refcount.h:315 [inline] refcount_dec_and_test include/linux/refcount.h:333 [inline] io_put_req fs/io_uring.c:2140 [inline] io_queue_linked_timeout fs/io_uring.c:6300 [inline] __io_queue_sqe+0xbef/0xec0 fs/io_uring.c:6354 io_submit_sqe fs/io_uring.c:6534 [inline] io_submit_sqes+0x2bbd/0x7c50 fs/io_uring.c:6660 __do_sys_io_uring_enter fs/io_uring.c:9240 [inline] __se_sys_io_uring_enter+0x256/0x1d60 fs/io_uring.c:9182 io_link_timeout_fn() should put only one reference of the linked timeout request, however in case of racing with the master request's completion first io_req_complete() puts one and then io_put_req_deferred() is called. Cc: stable@vger.kernel.org # 5.12+ Fixes: 9ae1f8dd372e0 ("io_uring: fix inconsistent lock state") Reported-by: syzbot+a2910119328ce8e7996f@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ff51018ff29de5ffa76f09273ef48cb24c720368.1620417627.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f46acbbeed57c..9ac5e278a91e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6363,10 +6363,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (prev) { io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req_deferred(prev, 1); + io_put_req_deferred(req, 1); } else { io_req_complete_post(req, -ETIME, 0); } - io_put_req_deferred(req, 1); return HRTIMER_NORESTART; } From e759959fe3b8313c81d6200be44cb8a644d845ea Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:34 -0500 Subject: [PATCH 069/263] x86/sev-es: Rename sev-es.{ch} to sev.{ch} SEV-SNP builds upon the SEV-ES functionality while adding new hardware protection. Version 2 of the GHCB specification adds new NAE events that are SEV-SNP specific. Rename the sev-es.{ch} to sev.{ch} so that all SEV* functionality can be consolidated in one place. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-2-brijesh.singh@amd.com --- arch/x86/boot/compressed/Makefile | 6 +++--- arch/x86/boot/compressed/{sev-es.c => sev.c} | 4 ++-- arch/x86/include/asm/{sev-es.h => sev.h} | 0 arch/x86/kernel/Makefile | 6 +++--- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/{sev-es-shared.c => sev-shared.c} | 0 arch/x86/kernel/{sev-es.c => sev.c} | 4 ++-- arch/x86/mm/extable.c | 2 +- arch/x86/platform/efi/efi_64.c | 2 +- arch/x86/realmode/init.c | 2 +- 11 files changed, 15 insertions(+), 15 deletions(-) rename arch/x86/boot/compressed/{sev-es.c => sev.c} (98%) rename arch/x86/include/asm/{sev-es.h => sev.h} (100%) rename arch/x86/kernel/{sev-es-shared.c => sev-shared.c} (100%) rename arch/x86/kernel/{sev-es.c => sev.c} (99%) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6e5522aebbbd4..2a2975236c9e3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -48,10 +48,10 @@ KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h KBUILD_CFLAGS += $(CLANG_FLAGS) -# sev-es.c indirectly inludes inat-table.h which is generated during +# sev.c indirectly inludes inat-table.h which is generated during # compilation and stored in $(objtree). Add the directory to the includes so # that the compiler finds it even with out-of-tree builds (make O=/some/path). -CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/ +CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n @@ -93,7 +93,7 @@ ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o - vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev.c similarity index 98% rename from arch/x86/boot/compressed/sev-es.c rename to arch/x86/boot/compressed/sev.c index 82041bd380e56..670e998fe9306 100644 --- a/arch/x86/boot/compressed/sev-es.c +++ b/arch/x86/boot/compressed/sev.c @@ -13,7 +13,7 @@ #include "misc.h" #include -#include +#include #include #include #include @@ -117,7 +117,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, #include "../../lib/insn.c" /* Include code for early handlers */ -#include "../../kernel/sev-es-shared.c" +#include "../../kernel/sev-shared.c" static bool early_setup_sev_es(void) { diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev.h similarity index 100% rename from arch/x86/include/asm/sev-es.h rename to arch/x86/include/asm/sev.h diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0704c2a94272c..0f66682ac02a6 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -20,7 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg -CFLAGS_REMOVE_sev-es.o = -pg +CFLAGS_REMOVE_sev.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -28,7 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n -KASAN_SANITIZE_sev-es.o := n +KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. @@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 18be44163a50f..de01903c37355 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include /* * Manage page tables very early on. diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2ef961cf4cfc5..4bce802d25fb1 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-shared.c similarity index 100% rename from arch/x86/kernel/sev-es-shared.c rename to arch/x86/kernel/sev-shared.c diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev.c similarity index 99% rename from arch/x86/kernel/sev-es.c rename to arch/x86/kernel/sev.c index 73873b0078380..9578c82832aa2 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include @@ -459,7 +459,7 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt } /* Include code shared with pre-decompression boot stage */ -#include "sev-es-shared.c" +#include "sev-shared.c" void noinstr __sev_es_nmi_complete(void) { diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index b93d6cd08a7ff..121921b2927cb 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index df7b5477fc4f2..7515e78ef8983 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include /* * We allocate runtime services regions top-down, starting from -4G, i.e. diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 1be71ef5e4c4e..2e1c1bec0f9e7 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; From b81fc74d53d1248de6db3136dd6b29e5d5528021 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:35 -0500 Subject: [PATCH 070/263] x86/sev: Move GHCB MSR protocol and NAE definitions in a common header The guest and the hypervisor contain separate macros to get and set the GHCB MSR protocol and NAE event fields. Consolidate the GHCB protocol definitions and helper macros in one place. Leave the supported protocol version define in separate files to keep the guest and hypervisor flexibility to support different GHCB version in the same release. There is no functional change intended. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-3-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 62 +++++++++++++++++++++++++++++++ arch/x86/include/asm/sev.h | 30 ++------------- arch/x86/kernel/sev-shared.c | 20 +++++----- arch/x86/kvm/svm/svm.h | 38 ++----------------- 4 files changed, 80 insertions(+), 70 deletions(-) create mode 100644 arch/x86/include/asm/sev-common.h diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h new file mode 100644 index 0000000000000..629c3df243f03 --- /dev/null +++ b/arch/x86/include/asm/sev-common.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD SEV header common between the guest and the hypervisor. + * + * Author: Brijesh Singh + */ + +#ifndef __ASM_X86_SEV_COMMON_H +#define __ASM_X86_SEV_COMMON_H + +#define GHCB_MSR_INFO_POS 0 +#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) + +#define GHCB_MSR_SEV_INFO_RESP 0x001 +#define GHCB_MSR_SEV_INFO_REQ 0x002 +#define GHCB_MSR_VER_MAX_POS 48 +#define GHCB_MSR_VER_MAX_MASK 0xffff +#define GHCB_MSR_VER_MIN_POS 32 +#define GHCB_MSR_VER_MIN_MASK 0xffff +#define GHCB_MSR_CBIT_POS 24 +#define GHCB_MSR_CBIT_MASK 0xff +#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ + ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ + (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ + (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ + GHCB_MSR_SEV_INFO_RESP) +#define GHCB_MSR_INFO(v) ((v) & 0xfffUL) +#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK) +#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK) + +#define GHCB_MSR_CPUID_REQ 0x004 +#define GHCB_MSR_CPUID_RESP 0x005 +#define GHCB_MSR_CPUID_FUNC_POS 32 +#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff +#define GHCB_MSR_CPUID_VALUE_POS 32 +#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff +#define GHCB_MSR_CPUID_REG_POS 30 +#define GHCB_MSR_CPUID_REG_MASK 0x3 +#define GHCB_CPUID_REQ_EAX 0 +#define GHCB_CPUID_REQ_EBX 1 +#define GHCB_CPUID_REQ_ECX 2 +#define GHCB_CPUID_REQ_EDX 3 +#define GHCB_CPUID_REQ(fn, reg) \ + (GHCB_MSR_CPUID_REQ | \ + (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \ + (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS)) + +#define GHCB_MSR_TERM_REQ 0x100 +#define GHCB_MSR_TERM_REASON_SET_POS 12 +#define GHCB_MSR_TERM_REASON_SET_MASK 0xf +#define GHCB_MSR_TERM_REASON_POS 16 +#define GHCB_MSR_TERM_REASON_MASK 0xff +#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \ + (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \ + ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS)) + +#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 +#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 + +#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) + +#endif diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index cf1d957c70919..fa5cd05d3b5be 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -10,34 +10,12 @@ #include #include +#include -#define GHCB_SEV_INFO 0x001UL -#define GHCB_SEV_INFO_REQ 0x002UL -#define GHCB_INFO(v) ((v) & 0xfffUL) -#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) -#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) -#define GHCB_PROTO_OUR 0x0001UL -#define GHCB_SEV_CPUID_REQ 0x004UL -#define GHCB_CPUID_REQ_EAX 0 -#define GHCB_CPUID_REQ_EBX 1 -#define GHCB_CPUID_REQ_ECX 2 -#define GHCB_CPUID_REQ_EDX 3 -#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ - (((unsigned long)reg & 3) << 30) | \ - (((unsigned long)fn) << 32)) +#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_PROTOCOL_MAX 1ULL +#define GHCB_DEFAULT_USAGE 0ULL -#define GHCB_PROTOCOL_MAX 0x0001UL -#define GHCB_DEFAULT_USAGE 0x0000UL - -#define GHCB_SEV_CPUID_RESP 0x005UL -#define GHCB_SEV_TERMINATE 0x100UL -#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ - (((((u64)reason_set) & 0x7) << 12) | \ - ((((u64)reason_val) & 0xff) << 16)) -#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 -#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 - -#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } enum es_result { diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 0aa9f13efd572..6ec8b3bfd76eb 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -26,13 +26,13 @@ static bool __init sev_es_check_cpu_features(void) static void __noreturn sev_es_terminate(unsigned int reason) { - u64 val = GHCB_SEV_TERMINATE; + u64 val = GHCB_MSR_TERM_REQ; /* * Tell the hypervisor what went wrong - only reason-set 0 is * currently supported. */ - val |= GHCB_SEV_TERMINATE_REASON(0, reason); + val |= GHCB_SEV_TERM_REASON(0, reason); /* Request Guest Termination from Hypvervisor */ sev_es_wr_ghcb_msr(val); @@ -47,15 +47,15 @@ static bool sev_es_negotiate_protocol(void) u64 val; /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_INFO(val) != GHCB_SEV_INFO) + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) return false; - if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR) return false; return true; @@ -153,28 +153,28 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->ax = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->bx = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->cx = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->dx = val >> 32; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 84b3133c2251d..42f8a7b9048fb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -20,6 +20,7 @@ #include #include +#include #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) @@ -525,40 +526,9 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); /* sev.c */ -#define GHCB_VERSION_MAX 1ULL -#define GHCB_VERSION_MIN 1ULL - -#define GHCB_MSR_INFO_POS 0 -#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) - -#define GHCB_MSR_SEV_INFO_RESP 0x001 -#define GHCB_MSR_SEV_INFO_REQ 0x002 -#define GHCB_MSR_VER_MAX_POS 48 -#define GHCB_MSR_VER_MAX_MASK 0xffff -#define GHCB_MSR_VER_MIN_POS 32 -#define GHCB_MSR_VER_MIN_MASK 0xffff -#define GHCB_MSR_CBIT_POS 24 -#define GHCB_MSR_CBIT_MASK 0xff -#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ - ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ - (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ - (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ - GHCB_MSR_SEV_INFO_RESP) - -#define GHCB_MSR_CPUID_REQ 0x004 -#define GHCB_MSR_CPUID_RESP 0x005 -#define GHCB_MSR_CPUID_FUNC_POS 32 -#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff -#define GHCB_MSR_CPUID_VALUE_POS 32 -#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff -#define GHCB_MSR_CPUID_REG_POS 30 -#define GHCB_MSR_CPUID_REG_MASK 0x3 - -#define GHCB_MSR_TERM_REQ 0x100 -#define GHCB_MSR_TERM_REASON_SET_POS 12 -#define GHCB_MSR_TERM_REASON_SET_MASK 0xf -#define GHCB_MSR_TERM_REASON_POS 16 -#define GHCB_MSR_TERM_REASON_MASK 0xff +#define GHCB_VERSION_MAX 1ULL +#define GHCB_VERSION_MIN 1ULL + extern unsigned int max_sev_asid; From 059e5c321a65657877924256ea8ad9c0df257b45 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:36 -0500 Subject: [PATCH 071/263] x86/msr: Rename MSR_K8_SYSCFG to MSR_AMD64_SYSCFG The SYSCFG MSR continued being updated beyond the K8 family; drop the K8 name from it. Suggested-by: Borislav Petkov Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-4-brijesh.singh@amd.com --- Documentation/virt/kvm/amd-memory-encryption.rst | 2 +- Documentation/x86/amd-memory-encryption.rst | 6 +++--- arch/x86/include/asm/msr-index.h | 6 +++--- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/mmconf-fam10h_64.c | 2 +- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- arch/x86/mm/mem_encrypt_identity.c | 6 +++--- arch/x86/pci/amd_bus.c | 2 +- arch/x86/realmode/rm/trampoline_64.S | 4 ++-- drivers/edac/amd64_edac.c | 2 +- tools/arch/x86/include/asm/msr-index.h | 6 +++--- 14 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst index 5ec8a1902e15a..5c081c8c7164a 100644 --- a/Documentation/virt/kvm/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -22,7 +22,7 @@ to SEV:: [ecx]: Bits[31:0] Number of encrypted guests supported simultaneously -If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 +If support for SEV is present, MSR 0xc001_0010 (MSR_AMD64_SYSCFG) and MSR 0xc001_0015 (MSR_K7_HWCR) can be used to determine if it can be enabled:: 0xc001_0010: diff --git a/Documentation/x86/amd-memory-encryption.rst b/Documentation/x86/amd-memory-encryption.rst index c48d452d07189..a1940ebe7be50 100644 --- a/Documentation/x86/amd-memory-encryption.rst +++ b/Documentation/x86/amd-memory-encryption.rst @@ -53,7 +53,7 @@ CPUID function 0x8000001f reports information related to SME:: system physical addresses, not guest physical addresses) -If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to +If support for SME is present, MSR 0xc00100010 (MSR_AMD64_SYSCFG) can be used to determine if SME is enabled and/or to enable memory encryption:: 0xc0010010: @@ -79,7 +79,7 @@ The state of SME in the Linux kernel can be documented as follows: The CPU supports SME (determined through CPUID instruction). - Enabled: - Supported and bit 23 of MSR_K8_SYSCFG is set. + Supported and bit 23 of MSR_AMD64_SYSCFG is set. - Active: Supported, Enabled and the Linux kernel is actively applying @@ -89,7 +89,7 @@ The state of SME in the Linux kernel can be documented as follows: SME can also be enabled and activated in the BIOS. If SME is enabled and activated in the BIOS, then all memory accesses will be encrypted and it will not be necessary to activate the Linux memory encryption support. If the BIOS -merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate +merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG), then Linux can activate memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or by supplying mem_encrypt=on on the kernel command line. However, if BIOS does not enable SME, then Linux will not be able to activate memory encryption, even diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 742d89a00721d..211ba3375ee96 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -537,9 +537,9 @@ /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d -#define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 -#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG 0xc0010010 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d11384dc9ab4..0adb0341cd7c5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -593,8 +593,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; /* diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 0c3b372318b70..b5f43049fa5f7 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -836,7 +836,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ - if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0) return 0; /* * Memory between 4GB and top of mem is forced WB by this magic bit. diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index b90f3f437765c..558108296f3cf 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -53,13 +53,13 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - rdmsr(MSR_K8_SYSCFG, lo, hi); + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; - mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi); } } diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index b5cb49e57df85..c94dec6a18345 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -95,7 +95,7 @@ static void get_fam10h_pci_mmconf_base(void) return; /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is not enabled? */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b649f92287a2e..433e8e4fb3a65 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -858,8 +858,8 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; enc_bit = cpuid_ebx(0x8000001f) & 0x3f; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6eda2834fc05e..853c40e893352 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3402,7 +3402,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: - case MSR_K8_SYSCFG: + case MSR_AMD64_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: case MSR_VM_HSAVE_PA: diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 04aba7e80a362..a9639f663d25f 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -529,7 +529,7 @@ void __init sme_enable(struct boot_params *bp) /* * No SME if Hypervisor bit is set. This check is here to * prevent a guest from trying to enable SME. For running as a - * KVM guest the MSR_K8_SYSCFG will be sufficient, but there + * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there * might be other hypervisors which emulate that MSR as non-zero * or even pass it through to the guest. * A malicious hypervisor can still trick a guest into this @@ -542,8 +542,8 @@ void __init sme_enable(struct boot_params *bp) return; /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + msr = __rdmsr(MSR_AMD64_SYSCFG); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; } else { /* SEV state cannot be controlled by a command line option */ diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index ae744b6a07856..dd40d3fea74e4 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -284,7 +284,7 @@ static int __init early_root_info_init(void) /* need to take out [4G, TOM2) for RAM*/ /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is enabled? */ if (val & (1<<21)) { diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 84c5d1b33d100..cc8391f86cdb6 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -123,9 +123,9 @@ SYM_CODE_START(startup_32) */ btl $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags jnc .Ldone - movl $MSR_K8_SYSCFG, %ecx + movl $MSR_AMD64_SYSCFG, %ecx rdmsr - bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax + bts $MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT, %eax jc .Ldone /* diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9fa4dfc6ebee6..f0d8f60acee10 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3083,7 +3083,7 @@ static void read_mc_regs(struct amd64_pvt *pvt) edac_dbg(0, " TOP_MEM: 0x%016llx\n", pvt->top_mem); /* Check first whether TOP_MEM2 is enabled: */ - rdmsrl(MSR_K8_SYSCFG, msr_val); + rdmsrl(MSR_AMD64_SYSCFG, msr_val); if (msr_val & BIT(21)) { rdmsrl(MSR_K8_TOP_MEM2, pvt->top_mem2); edac_dbg(0, " TOP_MEM2: 0x%016llx\n", pvt->top_mem2); diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 45029354e0a8b..c60b09e7602f4 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -533,9 +533,9 @@ /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d -#define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 -#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG 0xc0010010 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 From 970655aa9b42461f8394e4457307005bdeee14d9 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 23 Apr 2021 07:40:38 +0200 Subject: [PATCH 072/263] xen/gntdev: fix gntdev_mmap() error exit path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit d3eeb1d77c5d0af ("xen/gntdev: use mmu_interval_notifier_insert") introduced an error in gntdev_mmap(): in case the call of mmu_interval_notifier_insert_locked() fails the exit path should not call mmu_interval_notifier_remove(), as this might result in NULL dereferences. One reason for failure is e.g. a signal pending for the running process. Fixes: d3eeb1d77c5d0af ("xen/gntdev: use mmu_interval_notifier_insert") Cc: stable@vger.kernel.org Reported-by: Marek Marczykowski-Górecki Tested-by: Marek Marczykowski-Górecki Signed-off-by: Juergen Gross Reviewed-by: Luca Fancellu Link: https://lore.kernel.org/r/20210423054038.26696-1-jgross@suse.com Signed-off-by: Juergen Gross --- drivers/xen/gntdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index f01d58c7a042e..a3e7be96527d7 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1017,8 +1017,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) err = mmu_interval_notifier_insert_locked( &map->notifier, vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start, &gntdev_mmu_ops); - if (err) + if (err) { + map->vma = NULL; goto out_unlock_put; + } } mutex_unlock(&priv->lock); From dbc03e81586fc33e4945263fd6e09e22eb4b980f Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 8 May 2021 10:19:13 +0800 Subject: [PATCH 073/263] xen/unpopulated-alloc: fix error return code in fill_list() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: a4574f63edc6 ("mm/memremap_pages: convert to 'struct range'") Reported-by: Hulk Robot Signed-off-by: Zhen Lei Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210508021913.1727-1-thunder.leizhen@huawei.com Signed-off-by: Juergen Gross --- drivers/xen/unpopulated-alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c index e64e6befc63b7..87e6b7db892f5 100644 --- a/drivers/xen/unpopulated-alloc.c +++ b/drivers/xen/unpopulated-alloc.c @@ -39,8 +39,10 @@ static int fill_list(unsigned int nr_pages) } pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); - if (!pgmap) + if (!pgmap) { + ret = -ENOMEM; goto err_pgmap; + } pgmap->type = MEMORY_DEVICE_GENERIC; pgmap->range = (struct range) { From 0c6c2d3615efb7c292573f2e6c886929a2b2da6c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 28 Apr 2021 13:12:31 +0100 Subject: [PATCH 074/263] arm64: Generate cpucaps.h The arm64 code allocates an internal constant to every CPU feature it can detect, distinct from the public hwcap numbers we use to expose some features to userspace. Currently this is maintained manually which is an irritating source of conflicts when working on new features, to avoid this replace the header with a simple text file listing the names we've assigned and sort it to minimise conflicts. As part of doing this we also do the Kbuild hookup required to hook up an arch tools directory and to generate header files in there. This will result in a renumbering and reordering of the existing constants, since they are all internal only the values should not be important. The reordering will impact the order in which some steps in enumeration handle features but the algorithm is not intended to depend on this and I haven't seen any issues when testing. Due to the UAO cpucap having been removed in the past we end up with ARM64_NCAPS being 1 smaller than it was before. Signed-off-by: Mark Brown Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://lore.kernel.org/r/20210428121231.11219-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Makefile | 3 ++ arch/arm64/include/asm/Kbuild | 2 + arch/arm64/include/asm/cpucaps.h | 74 -------------------------------- arch/arm64/tools/Makefile | 22 ++++++++++ arch/arm64/tools/cpucaps | 65 ++++++++++++++++++++++++++++ arch/arm64/tools/gen-cpucaps.awk | 40 +++++++++++++++++ 6 files changed, 132 insertions(+), 74 deletions(-) delete mode 100644 arch/arm64/include/asm/cpucaps.h create mode 100644 arch/arm64/tools/Makefile create mode 100644 arch/arm64/tools/cpucaps create mode 100755 arch/arm64/tools/gen-cpucaps.awk diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 7ef44478560df..b52481f0605d8 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -175,6 +175,9 @@ vdso_install: $(if $(CONFIG_COMPAT_VDSO), \ $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso32 $@) +archprepare: + $(Q)$(MAKE) $(build)=arch/arm64/tools kapi + # We use MRPROPER_FILES and CLEAN_FILES now archclean: $(Q)$(MAKE) $(clean)=$(boot) diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 07ac208edc894..26889dbfe904d 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -5,3 +5,5 @@ generic-y += qrwlock.h generic-y += qspinlock.h generic-y += set_memory.h generic-y += user.h + +generated-y += cpucaps.h diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h deleted file mode 100644 index b0c5eda0498f2..0000000000000 --- a/arch/arm64/include/asm/cpucaps.h +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * arch/arm64/include/asm/cpucaps.h - * - * Copyright (C) 2016 ARM Ltd. - */ -#ifndef __ASM_CPUCAPS_H -#define __ASM_CPUCAPS_H - -#define ARM64_WORKAROUND_CLEAN_CACHE 0 -#define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE 1 -#define ARM64_WORKAROUND_845719 2 -#define ARM64_HAS_SYSREG_GIC_CPUIF 3 -#define ARM64_HAS_PAN 4 -#define ARM64_HAS_LSE_ATOMICS 5 -#define ARM64_WORKAROUND_CAVIUM_23154 6 -#define ARM64_WORKAROUND_834220 7 -#define ARM64_HAS_NO_HW_PREFETCH 8 -#define ARM64_HAS_VIRT_HOST_EXTN 11 -#define ARM64_WORKAROUND_CAVIUM_27456 12 -#define ARM64_HAS_32BIT_EL0 13 -#define ARM64_SPECTRE_V3A 14 -#define ARM64_HAS_CNP 15 -#define ARM64_HAS_NO_FPSIMD 16 -#define ARM64_WORKAROUND_REPEAT_TLBI 17 -#define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 -#define ARM64_WORKAROUND_858921 19 -#define ARM64_WORKAROUND_CAVIUM_30115 20 -#define ARM64_HAS_DCPOP 21 -#define ARM64_SVE 22 -#define ARM64_UNMAP_KERNEL_AT_EL0 23 -#define ARM64_SPECTRE_V2 24 -#define ARM64_HAS_RAS_EXTN 25 -#define ARM64_WORKAROUND_843419 26 -#define ARM64_HAS_CACHE_IDC 27 -#define ARM64_HAS_CACHE_DIC 28 -#define ARM64_HW_DBM 29 -#define ARM64_SPECTRE_V4 30 -#define ARM64_MISMATCHED_CACHE_TYPE 31 -#define ARM64_HAS_STAGE2_FWB 32 -#define ARM64_HAS_CRC32 33 -#define ARM64_SSBS 34 -#define ARM64_WORKAROUND_1418040 35 -#define ARM64_HAS_SB 36 -#define ARM64_WORKAROUND_SPECULATIVE_AT 37 -#define ARM64_HAS_ADDRESS_AUTH_ARCH 38 -#define ARM64_HAS_ADDRESS_AUTH_IMP_DEF 39 -#define ARM64_HAS_GENERIC_AUTH_ARCH 40 -#define ARM64_HAS_GENERIC_AUTH_IMP_DEF 41 -#define ARM64_HAS_IRQ_PRIO_MASKING 42 -#define ARM64_HAS_DCPODP 43 -#define ARM64_WORKAROUND_1463225 44 -#define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM 45 -#define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM 46 -#define ARM64_WORKAROUND_1542419 47 -#define ARM64_HAS_E0PD 48 -#define ARM64_HAS_RNG 49 -#define ARM64_HAS_AMU_EXTN 50 -#define ARM64_HAS_ADDRESS_AUTH 51 -#define ARM64_HAS_GENERIC_AUTH 52 -#define ARM64_HAS_32BIT_EL1 53 -#define ARM64_BTI 54 -#define ARM64_HAS_ARMv8_4_TTL 55 -#define ARM64_HAS_TLB_RANGE 56 -#define ARM64_MTE 57 -#define ARM64_WORKAROUND_1508412 58 -#define ARM64_HAS_LDAPR 59 -#define ARM64_KVM_PROTECTED_MODE 60 -#define ARM64_WORKAROUND_NVIDIA_CARMEL_CNP 61 -#define ARM64_HAS_EPAN 62 - -#define ARM64_NCAPS 63 - -#endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile new file mode 100644 index 0000000000000..932b4fe5c7684 --- /dev/null +++ b/arch/arm64/tools/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 + +gen := arch/$(ARCH)/include/generated +kapi := $(gen)/asm + +kapi-hdrs-y := $(kapi)/cpucaps.h + +targets += $(addprefix ../../../,$(gen-y) $(kapi-hdrs-y)) + +PHONY += kapi + +kapi: $(kapi-hdrs-y) $(gen-y) + +# Create output directory if not already present +_dummy := $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') + +quiet_cmd_gen_cpucaps = GEN $@ + cmd_gen_cpucaps = mkdir -p $(dir $@) && \ + $(AWK) -f $(filter-out $(PHONY),$^) > $@ + +$(kapi)/cpucaps.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE + $(call if_changed,gen_cpucaps) diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps new file mode 100644 index 0000000000000..21fbdda7086e2 --- /dev/null +++ b/arch/arm64/tools/cpucaps @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Internal CPU capabilities constants, keep this list sorted + +BTI +HAS_32BIT_EL0 +HAS_32BIT_EL1 +HAS_ADDRESS_AUTH +HAS_ADDRESS_AUTH_ARCH +HAS_ADDRESS_AUTH_IMP_DEF +HAS_AMU_EXTN +HAS_ARMv8_4_TTL +HAS_CACHE_DIC +HAS_CACHE_IDC +HAS_CNP +HAS_CRC32 +HAS_DCPODP +HAS_DCPOP +HAS_E0PD +HAS_EPAN +HAS_GENERIC_AUTH +HAS_GENERIC_AUTH_ARCH +HAS_GENERIC_AUTH_IMP_DEF +HAS_IRQ_PRIO_MASKING +HAS_LDAPR +HAS_LSE_ATOMICS +HAS_NO_FPSIMD +HAS_NO_HW_PREFETCH +HAS_PAN +HAS_RAS_EXTN +HAS_RNG +HAS_SB +HAS_STAGE2_FWB +HAS_SYSREG_GIC_CPUIF +HAS_TLB_RANGE +HAS_VIRT_HOST_EXTN +HW_DBM +KVM_PROTECTED_MODE +MISMATCHED_CACHE_TYPE +MTE +SPECTRE_V2 +SPECTRE_V3A +SPECTRE_V4 +SSBS +SVE +UNMAP_KERNEL_AT_EL0 +WORKAROUND_834220 +WORKAROUND_843419 +WORKAROUND_845719 +WORKAROUND_858921 +WORKAROUND_1418040 +WORKAROUND_1463225 +WORKAROUND_1508412 +WORKAROUND_1542419 +WORKAROUND_CAVIUM_23154 +WORKAROUND_CAVIUM_27456 +WORKAROUND_CAVIUM_30115 +WORKAROUND_CAVIUM_TX2_219_PRFM +WORKAROUND_CAVIUM_TX2_219_TVM +WORKAROUND_CLEAN_CACHE +WORKAROUND_DEVICE_LOAD_ACQUIRE +WORKAROUND_NVIDIA_CARMEL_CNP +WORKAROUND_QCOM_FALKOR_E1003 +WORKAROUND_REPEAT_TLBI +WORKAROUND_SPECULATIVE_AT diff --git a/arch/arm64/tools/gen-cpucaps.awk b/arch/arm64/tools/gen-cpucaps.awk new file mode 100755 index 0000000000000..18737a1ce0448 --- /dev/null +++ b/arch/arm64/tools/gen-cpucaps.awk @@ -0,0 +1,40 @@ +#!/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 +# gen-cpucaps.awk: arm64 cpucaps header generator +# +# Usage: awk -f gen-cpucaps.awk cpucaps.txt + +# Log an error and terminate +function fatal(msg) { + print "Error at line " NR ": " msg > "/dev/stderr" + exit 1 +} + +# skip blank lines and comment lines +/^$/ { next } +/^#/ { next } + +BEGIN { + print "#ifndef __ASM_CPUCAPS_H" + print "#define __ASM_CPUCAPS_H" + print "" + print "/* Generated file - do not edit */" + cap_num = 0 + print "" +} + +/^[vA-Z0-9_]+$/ { + printf("#define ARM64_%-30s\t%d\n", $0, cap_num++) + next +} + +END { + printf("#define ARM64_NCAPS\t\t\t\t%d\n", cap_num) + print "" + print "#endif" +} + +# Any lines not handled by previous rules are unexpected +{ + fatal("unhandled statement") +} From a1bed090fc56e6e24517d96bc076595544fb5317 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 7 May 2021 17:25:42 +0100 Subject: [PATCH 075/263] kselftest/arm64: Add missing stddef.h include to BTI tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explicitly include stddef.h when building the BTI tests so that we have a definition of NULL, with at least some toolchains this is not done implicitly by anything else: test.c: In function ‘start’: test.c:214:25: error: ‘NULL’ undeclared (first use in this function) 214 | sigaction(SIGILL, &sa, NULL); | ^~~~ test.c:20:1: note: ‘NULL’ is defined in header ‘’; did you forget to ‘#include ’? Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210507162542.23149-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/bti/test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/arm64/bti/test.c b/tools/testing/selftests/arm64/bti/test.c index 656b04976ccc6..67b77ab83c20e 100644 --- a/tools/testing/selftests/arm64/bti/test.c +++ b/tools/testing/selftests/arm64/bti/test.c @@ -6,6 +6,7 @@ #include "system.h" +#include #include #include #include From e5af36b2adb858e982d78d41d7363d05d951a19a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 21 Apr 2021 19:40:56 +0200 Subject: [PATCH 076/263] cpufreq: intel_pstate: Use HWP if enabled by platform firmware It turns out that there are systems where HWP is enabled during initialization by the platform firmware (BIOS), but HWP EPP support is not advertised. After commit 7aa1031223bc ("cpufreq: intel_pstate: Avoid enabling HWP if EPP is not supported") intel_pstate refuses to use HWP on those systems, but the fallback PERF_CTL interface does not work on them either because of enabled HWP, and once enabled, HWP cannot be disabled. Consequently, the users of those systems cannot control CPU performance scaling. Address this issue by making intel_pstate use HWP unconditionally if it is enabled already when the driver starts. Fixes: 7aa1031223bc ("cpufreq: intel_pstate: Avoid enabling HWP if EPP is not supported") Reported-by: Srinivas Pandruvada Tested-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki Cc: 5.9+ # 5.9+ --- drivers/cpufreq/intel_pstate.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f0401064d7aa5..0e69dffd5a767 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3033,6 +3033,14 @@ static const struct x86_cpu_id hwp_support_ids[] __initconst = { {} }; +static bool intel_pstate_hwp_is_enabled(void) +{ + u64 value; + + rdmsrl(MSR_PM_ENABLE, value); + return !!(value & 0x1); +} + static int __init intel_pstate_init(void) { const struct x86_cpu_id *id; @@ -3051,8 +3059,12 @@ static int __init intel_pstate_init(void) * Avoid enabling HWP for processors without EPP support, * because that means incomplete HWP implementation which is a * corner case and supporting it is generally problematic. + * + * If HWP is enabled already, though, there is no choice but to + * deal with it. */ - if (!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) { + if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || + intel_pstate_hwp_is_enabled()) { hwp_active++; hwp_mode_bdw = id->driver_data; intel_pstate.attr = hwp_cpufreq_attrs; From a3bc4ffeedf4693262fe7c6d133dcfcacd3d18c2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 May 2021 11:48:26 -0300 Subject: [PATCH 077/263] tools headers UAPI: Update tools's copy of drm.h headers Picking the changes from: b603e810f740e76b ("drm/uapi: document kernel capabilities") Doesn't result in any tooling changes: $ tools/perf/trace/beauty/drm_ioctl.sh > before $ cp include/uapi/drm/drm.h tools/include/uapi/drm/drm.h $ tools/perf/trace/beauty/drm_ioctl.sh > after $ diff -u before after Silencing these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/drm/drm.h' differs from latest version at 'include/uapi/drm/drm.h' diff -u tools/include/uapi/drm/drm.h include/uapi/drm/drm.h Cc: Simon Ser Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/drm.h | 125 +++++++++++++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 4 deletions(-) diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 0827037c54847..67b94bc3c8852 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -625,30 +625,147 @@ struct drm_gem_open { __u64 size; }; +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ #define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a CRTC index in the high bits of + * &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ #define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ #define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ #define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * PRIME buffers are exposed as dma-buf file descriptors. See + * Documentation/gpu/drm-mm.rst, section "PRIME Buffer Sharing". + */ #define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + */ #define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + */ #define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ #define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC. + */ #define DRM_CAP_ASYNC_PAGE_FLIP 0x7 -/* - * The CURSOR_WIDTH and CURSOR_HEIGHT capabilities return a valid widthxheight - * combination for the hardware cursor. The intention is that a hardware - * agnostic userspace can query a cursor plane size to use. +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. * * Note that the cross-driver contract is to merely return a valid size; * drivers are free to attach another meaning on top, eg. i915 returns the * maximum plane size. */ #define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ #define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ #define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ #define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ #define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See + * Documentation/gpu/drm-mm.rst, section "DRM Sync Objects". + */ #define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * Documentation/gpu/drm-mm.rst, section "DRM Sync Objects". + */ #define DRM_CAP_SYNCOBJ_TIMELINE 0x14 /* DRM_IOCTL_GET_CAP ioctl argument type */ From 0fdee797d60d71e5a6fd59aa573d84a858e715dd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 May 2021 11:51:17 -0300 Subject: [PATCH 078/263] tools headers UAPI: Sync drm/i915_drm.h with the kernel sources To pick the changes in: b5b6f6a610127b17 ("drm/i915/gem: Drop legacy execbuffer support (v2)") That don't result in any change in tooling as this is just adding a comment. Only silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/drm/i915_drm.h' differs from latest version at 'include/uapi/drm/i915_drm.h' diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Daniel Vetter Cc: Jason Ekstrand Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/i915_drm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 1987e2ea79a3b..ddc47bbf48b6d 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -943,6 +943,7 @@ struct drm_i915_gem_exec_object { __u64 offset; }; +/* DRM_IOCTL_I915_GEM_EXECBUFFER was removed in Linux 5.13 */ struct drm_i915_gem_execbuffer { /** * List of buffers to be validated with their relocations to be From b3172585b13d7171c32cfabdf938eca7fdfe9b31 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 May 2021 11:53:37 -0300 Subject: [PATCH 079/263] tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes from these csets: d0946a882e622022 ("perf/x86/intel: Hybrid PMU support for perf capabilities") That cause no changes to tooling as it isn't adding any new MSR, just some capabilities for a pre-existing one: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after $ Just silences this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h' diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Kan Liang Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 45029354e0a8b..742d89a00721d 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -185,6 +185,9 @@ #define MSR_PEBS_DATA_CFG 0x000003f2 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define PERF_CAP_METRICS_IDX 15 +#define PERF_CAP_PT_IDX 16 + #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_IA32_RTIT_CTL 0x00000570 @@ -265,6 +268,7 @@ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) #define DEBUGCTLMSR_BTINT (1UL << 8) From e8c1167606c63fd8f9934d0b6ce80281463a4945 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 2 Apr 2021 18:40:20 +0900 Subject: [PATCH 080/263] perf record: Disallow -c and -F option at the same time It's confusing which one is effective when the both options are given. The current code happens to use -c in this case but users might not be aware of it. We can change it to complain about that instead of relying on the implicit priority. Before: $ perf record -c 111111 -F 99 true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.031 MB perf.data (8 samples) ] $ perf evlist -F cycles: sample_period=111111 $ After: $ perf record -c 111111 -F 99 true cannot set frequency and period at the same time $ So this change can break existing usages, but I think it's rare to have both options and it'd be better changing them. Suggested-by: Alexey Alexandrov Signed-off-by: Namhyung Kim Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210402094020.28164-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/record.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index f99852d54b147..43e5b563dee89 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -157,9 +157,15 @@ static int get_max_rate(unsigned int *rate) static int record_opts__config_freq(struct record_opts *opts) { bool user_freq = opts->user_freq != UINT_MAX; + bool user_interval = opts->user_interval != ULLONG_MAX; unsigned int max_rate; - if (opts->user_interval != ULLONG_MAX) + if (user_interval && user_freq) { + pr_err("cannot set frequency and period at the same time\n"); + return -1; + } + + if (user_interval) opts->default_interval = opts->user_interval; if (user_freq) opts->freq = opts->user_freq; From 7aa3c9eabdf76017679e975e2ffd50cde3c010b8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 6 May 2021 15:56:40 -0700 Subject: [PATCH 081/263] perf jevents: Silence warning for ArchStd files JSON files in the level 1 directory are used for ArchStd events (see preprocess_arch_std_files), as such they shouldn't be warned about. Signed-off-by: Ian Rogers Reviewed-by: John Garry Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Joakim Zhang Cc: Kajol Jain Cc: Kim Phillips Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210506225640.1461000-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index ed4f0bd72e5a3..7422b0ea87901 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -1123,8 +1123,10 @@ static int process_one_file(const char *fpath, const struct stat *sb, mapfile = strdup(fpath); return 0; } - - pr_info("%s: Ignoring file %s\n", prog, fpath); + if (is_json_file(bname)) + pr_debug("%s: ArchStd json is preprocessed %s\n", prog, fpath); + else + pr_info("%s: Ignoring file %s\n", prog, fpath); return 0; } From a11c9a6e472457cf9eeafb585fc5c912f51d1b23 Mon Sep 17 00:00:00 2001 From: Dmitry Koshelev Date: Thu, 6 May 2021 13:11:49 +0000 Subject: [PATCH 082/263] perf session: Fix swapping of cpu_map and stat_config records 'data' field in perf_record_cpu_map_data struct is 16-bit wide and so should be swapped using bswap_16(). 'nr' field in perf_record_stat_config struct should be swapped before being used for size calculation. Signed-off-by: Dmitry Koshelev Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210506131244.13328-1-karaghiozis@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a12cf4f0e97a7..106b3d60881a5 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -904,7 +904,7 @@ static void perf_event__cpu_map_swap(union perf_event *event, struct perf_record_record_cpu_map *mask; unsigned i; - data->type = bswap_64(data->type); + data->type = bswap_16(data->type); switch (data->type) { case PERF_CPU_MAP__CPUS: @@ -937,7 +937,7 @@ static void perf_event__stat_config_swap(union perf_event *event, { u64 size; - size = event->stat_config.nr * sizeof(event->stat_config.data[0]); + size = bswap_64(event->stat_config.nr) * sizeof(event->stat_config.data[0]); size += 1; /* nr item itself */ mem_bswap_64(&event->stat_config.nr, size); } From ad1237c30d975535a669746496cbed136aa5a045 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 8 May 2021 22:50:20 +0200 Subject: [PATCH 083/263] perf tools: Fix dynamic libbpf link Justin reported broken build with LIBBPF_DYNAMIC=1. When linking libbpf dynamically we need to use perf's hashmap object, because it's not exported in libbpf.so (only in libbpf.a). Following build is now passing: $ make LIBBPF_DYNAMIC=1 BUILD: Doing 'make -j8' parallel build ... $ ldd perf | grep libbpf libbpf.so.0 => /lib64/libbpf.so.0 (0x00007fa7630db000) Fixes: eee19501926d ("perf tools: Grab a copy of libbpf's hashmap") Reported-by: Justin M. Forbes Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210508205020.617984-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 1 + tools/perf/util/Build | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 0d6619064a838..406a9519145e5 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -540,6 +540,7 @@ ifndef NO_LIBELF ifdef LIBBPF_DYNAMIC ifeq ($(feature-libbpf), 1) EXTLIBS += -lbpf + $(call detected,CONFIG_LIBBPF_DYNAMIC) else dummy := $(error Error: No libbpf devel library found, please install libbpf-devel); endif diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 8c0d9f368ebcf..b64bdc1a7026d 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -145,7 +145,14 @@ perf-$(CONFIG_LIBELF) += symbol-elf.o perf-$(CONFIG_LIBELF) += probe-file.o perf-$(CONFIG_LIBELF) += probe-event.o +ifdef CONFIG_LIBBPF_DYNAMIC + hashmap := 1 +endif ifndef CONFIG_LIBBPF + hashmap := 1 +endif + +ifdef hashmap perf-y += hashmap.o endif From 0d943d5fde6070c2661a99618ea95b99655589ad Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: [PATCH 084/263] tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: 15fb7de1a7f5af0d ("KVM: SVM: Add KVM_SEV_RECEIVE_UPDATE_DATA command") 3bf725699bf62494 ("KVM: arm64: Add support for the KVM PTP service") 4cfdd47d6d95aca4 ("KVM: SVM: Add KVM_SEV SEND_START command") 54526d1fd59338fd ("KVM: x86: Support KVM VMs sharing SEV context") 5569e2e7a650dfff ("KVM: SVM: Add support for KVM_SEV_SEND_CANCEL command") 8b13c36493d8cb56 ("KVM: introduce KVM_CAP_SET_GUEST_DEBUG2") af43cbbf954b50ca ("KVM: SVM: Add support for KVM_SEV_RECEIVE_START command") d3d1af85e2c75bb5 ("KVM: SVM: Add KVM_SEND_UPDATE_DATA command") fe7e948837f312d8 ("KVM: x86: Add capability to grant VM access to privileged SGX attribute") That don't cause any change in tooling as it doesn't introduce any new ioctl. $ grep kvm tools/perf/trace/beauty/*.sh tools/perf/trace/beauty/kvm_ioctl.sh:printf "static const char *kvm_ioctl_cmds[] = {\n" tools/perf/trace/beauty/kvm_ioctl.sh:egrep $regex ${header_dir}/kvm.h | \ $ $ tools/perf/trace/beauty/kvm_ioctl.sh > before $ cp include/uapi/linux/kvm.h tools/include/uapi/linux/kvm.h $ tools/perf/trace/beauty/kvm_ioctl.sh > after $ diff -u before after $ This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Brijesh Singh Cc: Jianyong Wu Cc: Marc Zyngier Cc: Nathan Tempelman Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Steve Rutherford Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index f6afee209620d..3fd9a7e9d90cd 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1078,6 +1078,10 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 +#define KVM_CAP_SET_GUEST_DEBUG2 195 +#define KVM_CAP_SGX_ATTRIBUTE 196 +#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 +#define KVM_CAP_PTP_KVM 198 #ifdef KVM_CAP_IRQ_ROUTING @@ -1671,6 +1675,8 @@ enum sev_cmd_id { KVM_SEV_CERT_EXPORT, /* Attestation report */ KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, KVM_SEV_NR_MAX, }; @@ -1729,6 +1735,45 @@ struct kvm_sev_attestation_report { __u32 len; }; +struct kvm_sev_send_start { + __u32 policy; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) From b35629bc2fd59691504debda99c320cf966c8e3a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:45:25 -0300 Subject: [PATCH 085/263] tools headers kvm: Sync kvm headers with the kernel sources To pick the changes in: 3c0c2ad1ae75963c ("KVM: VMX: Add basic handling of VM-Exit from SGX enclave") None of them trigger any changes in tooling, this time this is just to silence these perf build warnings: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/vmx.h' differs from latest version at 'arch/x86/include/uapi/asm/vmx.h' diff -u tools/arch/x86/include/uapi/asm/vmx.h arch/x86/include/uapi/asm/vmx.h Cc: Paolo Bonzini Cc: Sean Christopherson Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/vmx.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index b8e650a985e35..946d761adbd3d 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -27,6 +27,7 @@ #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 +#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 From a00b7e39d6b56e6f49cdd51a9ebf92627a19d877 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 7 May 2021 17:54:35 +0900 Subject: [PATCH 086/263] perf tools: Fix a build error on arm64 with clang Since clang's -Wmissing-field-initializers warns if a data structure is initialized with a signle NULL as below, ---- tools/perf $ make CC=clang LLVM=1 ... arch/arm64/util/kvm-stat.c:74:9: error: missing field 'ops' initializer [-Werror,-Wmissing-field-initializers] { NULL }, ^ 1 error generated. ---- add another field initializer expressly as same as other arch's kvm-stat.c code. Signed-off-by: Masami Hiramatsu Cc: Anders Roxell Cc: Leo Yan Cc: Sergey Senozhatsky Link: http://lore.kernel.org/lkml/162037767540.94840.15758657049033010518.stgit@devnote2 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/kvm-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c index 2303256b7d05e..73d18e0ed6f6a 100644 --- a/tools/perf/arch/arm64/util/kvm-stat.c +++ b/tools/perf/arch/arm64/util/kvm-stat.c @@ -71,7 +71,7 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = { .name = "vmexit", .ops = &exit_events, }, - { NULL }, + { NULL, NULL }, }; const char * const kvm_skip_events[] = { From f8bcb061ea013a9b39a071b9dd9f6ea0aa2caf72 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:55:30 -0300 Subject: [PATCH 087/263] tools headers UAPI: Sync files changed by landlock, quotactl_path and mount_settattr new syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To pick the changes in these csets: a49f4f81cb48925e ("arch: Wire up Landlock syscalls") 2a1867219c7b27f9 ("fs: add mount_setattr()") fa8b90070a80bb1a ("quota: wire up quotactl_path") That silences these perf build warnings and add support for those new syscalls in tools such as 'perf trace'. For instance, this is now possible: # ~acme/bin/perf trace -v -e landlock* event qualifier tracepoint filter: (common_pid != 129365 && common_pid != 3502) && (id == 444 || id == 445 || id == 446) ^C# That is tha filter expression attached to the raw_syscalls:sys_{enter,exit} tracepoints. $ grep landlock tools/perf/arch/x86/entry/syscalls/syscall_64.tbl 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self $ This addresses these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl' diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl' differs from latest version at 'arch/mips/kernel/syscalls/syscall_n64.tbl' diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Christian Brauner Cc: James Morris Cc: Jan Kara Cc: Mickaël Salaün Cc: Sascha Hauer Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 11 ++++++++++- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 5 +++++ tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 4 ++++ tools/perf/arch/s390/entry/syscalls/syscall.tbl | 4 ++++ tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 4 ++++ 5 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index ce58cff99b665..6de5a7fc066b8 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -863,9 +863,18 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise) __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_mount_setattr 442 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) +#define __NR_quotactl_path 443 +__SYSCALL(__NR_quotactl_path, sys_quotactl_path) + +#define __NR_landlock_create_ruleset 444 +__SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) +#define __NR_landlock_add_rule 445 +__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) +#define __NR_landlock_restrict_self 446 +__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #undef __NR_syscalls -#define __NR_syscalls 443 +#define __NR_syscalls 447 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 91649690b52f1..9974f5f8e49bc 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -356,3 +356,8 @@ 439 n64 faccessat2 sys_faccessat2 440 n64 process_madvise sys_process_madvise 441 n64 epoll_pwait2 sys_epoll_pwait2 +442 n64 mount_setattr sys_mount_setattr +443 n64 quotactl_path sys_quotactl_path +444 n64 landlock_create_ruleset sys_landlock_create_ruleset +445 n64 landlock_add_rule sys_landlock_add_rule +446 n64 landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 0b2480cf3e479..2e68fbb57cc66 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -522,3 +522,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr +443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 3abef2144dac7..7e4a2aba366df 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -445,3 +445,7 @@ 440 common process_madvise sys_process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr sys_mount_setattr +443 common quotactl_path sys_quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 7bf01cbe582f0..ecd551b08d052 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,6 +364,10 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr +443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self # # Due to a historical design error, certain syscalls are numbered differently From 5a80ee4219a52194f0e815bbceec40eb32c523ec Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:06:40 -0300 Subject: [PATCH 088/263] tools headers UAPI: Sync linux/prctl.h with the kernel sources To pick a new prctl introduced in: 201698626fbca1cf ("arm64: Introduce prctl(PR_PAC_{SET,GET}_ENABLED_KEYS)") That results in $ grep prctl tools/perf/trace/beauty/*.sh tools/perf/trace/beauty/prctl_option.sh:printf "static const char *prctl_options[] = {\n" tools/perf/trace/beauty/prctl_option.sh:egrep $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \ tools/perf/trace/beauty/prctl_option.sh:printf "static const char *prctl_set_mm_options[] = {\n" tools/perf/trace/beauty/prctl_option.sh:egrep $regex ${header_dir}/prctl.h | \ tools/perf/trace/beauty/x86_arch_prctl.sh:prctl_arch_header=${x86_header_dir}/prctl.h tools/perf/trace/beauty/x86_arch_prctl.sh: printf "#define x86_arch_prctl_codes_%d_offset %s\n" $idx $first_entry tools/perf/trace/beauty/x86_arch_prctl.sh: printf "static const char *x86_arch_prctl_codes_%d[] = {\n" $idx tools/perf/trace/beauty/x86_arch_prctl.sh: egrep -q $regex ${prctl_arch_header} && \ tools/perf/trace/beauty/x86_arch_prctl.sh: (egrep $regex ${prctl_arch_header} | \ $ tools/perf/trace/beauty/prctl_option.sh > before $ cp include/uapi/linux/prctl.h tools/include/uapi/linux/prctl.h $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after --- before 2021-05-09 10:06:10.064559675 -0300 +++ after 2021-05-09 10:06:21.319791396 -0300 @@ -54,6 +54,8 @@ [57] = "SET_IO_FLUSHER", [58] = "GET_IO_FLUSHER", [59] = "SET_SYSCALL_USER_DISPATCH", + [60] = "PAC_SET_ENABLED_KEYS", + [61] = "PAC_GET_ENABLED_KEYS", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", $ Now users can do: # perf trace -e syscalls:sys_enter_prctl --filter "option==PAC_GET_ENABLED_KEYS" ^C# # trace -v -e syscalls:sys_enter_prctl --filter "option==PAC_GET_ENABLED_KEYS" New filter for syscalls:sys_enter_prctl: (option==0x3d) && (common_pid != 5519 && common_pid != 3404) ^C# And also when prctl appears in a session, its options will be translated to the string. Cc: Catalin Marinas Cc: Peter Collingbourne Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/prctl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 667f1aed091c2..18a9f59dc067f 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -255,4 +255,8 @@ struct prctl_mm_map { # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 +/* Set/get enabled arm64 pointer authentication keys */ +#define PR_PAC_SET_ENABLED_KEYS 60 +#define PR_PAC_GET_ENABLED_KEYS 61 + #endif /* _LINUX_PRCTL_H */ From fb24e308b6310541e70d11a3f19dc40742974b95 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:19:37 -0300 Subject: [PATCH 089/263] tools arch: Update arch/x86/lib/mem{cpy,set}_64.S copies used in 'perf bench mem memcpy' To bring in the change made in this cset: 5e21a3ecad1500e3 ("x86/alternative: Merge include files") This just silences these perf tools build warnings, no change in the tools: Warning: Kernel ABI header at 'tools/arch/x86/lib/memcpy_64.S' differs from latest version at 'arch/x86/lib/memcpy_64.S' diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S Warning: Kernel ABI header at 'tools/arch/x86/lib/memset_64.S' differs from latest version at 'arch/x86/lib/memset_64.S' diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S Cc: Borislav Petkov Cc: Juergen Gross Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/lib/memcpy_64.S | 2 +- tools/arch/x86/lib/memset_64.S | 2 +- tools/include/asm/{alternative-asm.h => alternative.h} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename tools/include/asm/{alternative-asm.h => alternative.h} (100%) diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S index 1e299ac73c869..1cc9da6e29c79 100644 --- a/tools/arch/x86/lib/memcpy_64.S +++ b/tools/arch/x86/lib/memcpy_64.S @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include .pushsection .noinstr.text, "ax" diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S index 0bfd26e4ca9e9..9827ae267f96e 100644 --- a/tools/arch/x86/lib/memset_64.S +++ b/tools/arch/x86/lib/memset_64.S @@ -3,7 +3,7 @@ #include #include -#include +#include #include /* diff --git a/tools/include/asm/alternative-asm.h b/tools/include/asm/alternative.h similarity index 100% rename from tools/include/asm/alternative-asm.h rename to tools/include/asm/alternative.h From 3916329309eace19e8c32bc821064a119474c309 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:21:33 -0300 Subject: [PATCH 090/263] tools include UAPI powerpc: Sync errno.h with the kernel headers To pick the change in: 7de21e679e6a789f ("powerpc: fix EDEADLOCK redefinition error in uapi/asm/errno.h") That will make the errno number -> string tables to pick this change on powerpc. Silencing this perf build warning: Warning: Kernel ABI header at 'tools/arch/powerpc/include/uapi/asm/errno.h' differs from latest version at 'arch/powerpc/include/uapi/asm/errno.h' diff -u tools/arch/powerpc/include/uapi/asm/errno.h arch/powerpc/include/uapi/asm/errno.h Cc: Michael Ellerman Cc: Tony Ambardar Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/powerpc/include/uapi/asm/errno.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/arch/powerpc/include/uapi/asm/errno.h b/tools/arch/powerpc/include/uapi/asm/errno.h index cc79856896a19..4ba87de32be00 100644 --- a/tools/arch/powerpc/include/uapi/asm/errno.h +++ b/tools/arch/powerpc/include/uapi/asm/errno.h @@ -2,6 +2,7 @@ #ifndef _ASM_POWERPC_ERRNO_H #define _ASM_POWERPC_ERRNO_H +#undef EDEADLOCK #include #undef EDEADLOCK From 6faf64f5248166ecaf50107e883c383e0b66bb70 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:24:29 -0300 Subject: [PATCH 091/263] tools headers cpufeatures: Sync with the kernel sources To pick the changes from: 4e6292114c741221 ("x86/paravirt: Add new features for paravirt patching") a161545ab53b174c ("x86/cpufeatures: Enumerate Intel Hybrid Technology feature bit") a89dfde3dc3c2dbf ("x86: Remove dynamic NOP selection") b8921dccf3b25798 ("x86/cpufeatures: Add SGX1 and SGX2 sub-features") f21d4d3b97a86035 ("x86/cpufeatures: Enumerate #DB for bus lock detection") f333374e108e7e4c ("x86/cpufeatures: Add the Virtual SPEC_CTRL feature") This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Babu Moger Cc: Borislav Petkov Cc: Fenghua Yu Cc: Juergen Gross Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ricardo Neri Cc: Sean Christopherson Cc: Thomas Gleixner Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index cc96e26d69f7a..ac37830ae9412 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -84,7 +84,7 @@ /* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -236,6 +236,8 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -290,6 +292,8 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -336,6 +340,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ @@ -354,6 +359,7 @@ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ @@ -374,6 +380,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ +#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ From 71d7924b3e8acaca6a3b0fc3261170031ada3b70 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:29:10 -0300 Subject: [PATCH 092/263] tools headers UAPI: Sync perf_event.h with the kernel sources To pick up the changes in: 2b26f0aa004995f4 ("perf: Support only inheriting events if cloned with CLONE_THREAD") 2e498d0a74e5b88a ("perf: Add support for event removal on exec") 547b60988e631f74 ("perf: aux: Add flags for the buffer format") 55bcf6ef314ae8ba ("perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE") 7dde51767ca5339e ("perf: aux: Add CoreSight PMU buffer formats") 97ba62b278674293 ("perf: Add support for SIGTRAP on perf events") d0d1dd628527c77d ("perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event") Also change the expected sizeof(struct perf_event_attr) from 120 to 128 due to fields being added for the SIGTRAP changes. Addressing this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Cc: Kan Liang Cc: Marco Elver Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki K Poulose Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 26 ++++++++++++++++++++----- tools/perf/tests/attr/base-record | 2 +- tools/perf/tests/attr/base-stat | 2 +- tools/perf/tests/attr/system-wide-dummy | 2 +- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 14332f4cf8167..bf8143505c49d 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -127,6 +127,7 @@ enum perf_sw_ids { PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_BPF_OUTPUT = 10, + PERF_COUNT_SW_CGROUP_SWITCHES = 11, PERF_COUNT_SW_MAX, /* non-ABI */ }; @@ -326,6 +327,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -404,7 +406,10 @@ struct perf_event_attr { cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ build_id : 1, /* use build id in mmap2 events */ - __reserved_1 : 29; + inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ + remove_on_exec : 1, /* event is removed from task on exec */ + sigtrap : 1, /* send synchronous SIGTRAP on event */ + __reserved_1 : 26; union { __u32 wakeup_events; /* wakeup every n events */ @@ -456,6 +461,12 @@ struct perf_event_attr { __u16 __reserved_2; __u32 aux_sample_size; __u32 __reserved_3; + + /* + * User provided data if sigtrap=1, passed back to user via + * siginfo_t::si_perf, e.g. to permit user to identify the event. + */ + __u64 sig_data; }; /* @@ -1171,10 +1182,15 @@ enum perf_callchain_context { /** * PERF_RECORD_AUX::flags bits */ -#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ -#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ -#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ -#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ + +/* CoreSight PMU AUX buffer formats */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index 645009c08b3cb..4a7b8deef3fdd 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -5,7 +5,7 @@ group_fd=-1 flags=0|8 cpu=* type=0|1 -size=120 +size=128 config=0 sample_period=* sample_type=263 diff --git a/tools/perf/tests/attr/base-stat b/tools/perf/tests/attr/base-stat index b0f42c34882e8..4081644565306 100644 --- a/tools/perf/tests/attr/base-stat +++ b/tools/perf/tests/attr/base-stat @@ -5,7 +5,7 @@ group_fd=-1 flags=0|8 cpu=* type=0 -size=120 +size=128 config=0 sample_period=0 sample_type=65536 diff --git a/tools/perf/tests/attr/system-wide-dummy b/tools/perf/tests/attr/system-wide-dummy index eba723cc0d380..86a15dd359d93 100644 --- a/tools/perf/tests/attr/system-wide-dummy +++ b/tools/perf/tests/attr/system-wide-dummy @@ -7,7 +7,7 @@ cpu=* pid=-1 flags=8 type=1 -size=120 +size=128 config=9 sample_period=4000 sample_type=455 From 29038ae2ae566d9441e81cda3539db17c20bf06a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 10 May 2021 14:02:17 +0200 Subject: [PATCH 093/263] Revert "Revert "ACPI: scan: Turn off unused power resources during initialization"" Revert commit 5db91e9cb5b3 ("Revert "ACPI: scan: Turn off unused power resources during initialization") which was not necessary. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/internal.h | 1 + drivers/acpi/power.c | 2 +- drivers/acpi/scan.c | 2 ++ drivers/acpi/sleep.h | 1 - 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index e6a5d997241c4..9fcefcdc1dbe0 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -139,6 +139,7 @@ int acpi_device_sleep_wake(struct acpi_device *dev, int acpi_power_get_inferred_state(struct acpi_device *device, int *state); int acpi_power_on_resources(struct acpi_device *device, int state); int acpi_power_transition(struct acpi_device *device, int state); +void acpi_turn_off_unused_power_resources(void); /* -------------------------------------------------------------------------- Device Power Management diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index 7e69931be828c..bacae6d178ff5 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -996,6 +996,7 @@ void acpi_resume_power_resources(void) mutex_unlock(&power_resource_list_lock); } +#endif void acpi_turn_off_unused_power_resources(void) { @@ -1016,4 +1017,3 @@ void acpi_turn_off_unused_power_resources(void) mutex_unlock(&power_resource_list_lock); } -#endif diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index a184529d8fa40..1584c9e463bdf 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2360,6 +2360,8 @@ int __init acpi_scan_init(void) } } + acpi_turn_off_unused_power_resources(); + acpi_scan_initialized = true; out: diff --git a/drivers/acpi/sleep.h b/drivers/acpi/sleep.h index 1856f76ac83f7..7fe41ee489d61 100644 --- a/drivers/acpi/sleep.h +++ b/drivers/acpi/sleep.h @@ -8,7 +8,6 @@ extern struct list_head acpi_wakeup_device_list; extern struct mutex acpi_device_lock; extern void acpi_resume_power_resources(void); -extern void acpi_turn_off_unused_power_resources(void); static inline acpi_status acpi_set_waking_vector(u32 wakeup_address) { From 14b6cff54edaca5740068e9ed070152727ed7718 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Apr 2021 17:26:19 +0200 Subject: [PATCH 094/263] staging: rtl8723bs: avoid bogus gcc warning gcc gets confused by some of the type casts and produces an apparently senseless warning about an out-of-bound memcpy to an unrelated array in the same structure: drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c: In function 'rtw_cfg80211_ap_set_encryption': cc1: error: writing 8 bytes into a region of size 0 [-Werror=stringop-overflow=] In file included from drivers/staging/rtl8723bs/include/drv_types.h:32, from drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c:10: drivers/staging/rtl8723bs/include/rtw_security.h:98:15: note: at offset [184, 4264] into destination object 'dot11AuthAlgrthm' of size 4 98 | u32 dot11AuthAlgrthm; /* 802.11 auth, could be open, shared, 8021x and authswitch */ | ^~~~~~~~~~~~~~~~ cc1: error: writing 8 bytes into a region of size 0 [-Werror=stringop-overflow=] drivers/staging/rtl8723bs/include/rtw_security.h:98:15: note: at offset [264, 4344] into destination object 'dot11AuthAlgrthm' of size 4 This is a known gcc bug, and the patch here is only a workaround, but the approach of using a temporary variable to hold a pointer to the key also improves readability in addition to avoiding the warning, so overall this should still help. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99673 Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210422152648.2891996-1-arnd@kernel.org Cc: stable Signed-off-by: Greg Kroah-Hartman --- .../staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 23 +++++++++++-------- .../staging/rtl8723bs/os_dep/ioctl_linux.c | 21 +++++++++-------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index c1dac6eec59f5..a6d731e959a28 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -527,6 +527,9 @@ static int rtw_cfg80211_ap_set_encryption(struct net_device *dev, struct ieee_pa struct mlme_priv *pmlmepriv = &padapter->mlmepriv; struct security_priv *psecuritypriv = &(padapter->securitypriv); struct sta_priv *pstapriv = &padapter->stapriv; + char *grpkey = padapter->securitypriv.dot118021XGrpKey[param->u.crypt.idx].skey; + char *txkey = padapter->securitypriv.dot118021XGrptxmickey[param->u.crypt.idx].skey; + char *rxkey = padapter->securitypriv.dot118021XGrprxmickey[param->u.crypt.idx].skey; param->u.crypt.err = 0; param->u.crypt.alg[IEEE_CRYPT_ALG_NAME_LEN - 1] = '\0'; @@ -609,7 +612,7 @@ static int rtw_cfg80211_ap_set_encryption(struct net_device *dev, struct ieee_pa { if (strcmp(param->u.crypt.alg, "WEP") == 0) { - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); psecuritypriv->dot118021XGrpPrivacy = _WEP40_; if (param->u.crypt.key_len == 13) @@ -622,12 +625,12 @@ static int rtw_cfg80211_ap_set_encryption(struct net_device *dev, struct ieee_pa { psecuritypriv->dot118021XGrpPrivacy = _TKIP_; - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); /* DEBUG_ERR("set key length :param->u.crypt.key_len =%d\n", param->u.crypt.key_len); */ /* set mic key */ - memcpy(psecuritypriv->dot118021XGrptxmickey[param->u.crypt.idx].skey, &(param->u.crypt.key[16]), 8); - memcpy(psecuritypriv->dot118021XGrprxmickey[param->u.crypt.idx].skey, &(param->u.crypt.key[24]), 8); + memcpy(txkey, &(param->u.crypt.key[16]), 8); + memcpy(rxkey, &(param->u.crypt.key[24]), 8); psecuritypriv->busetkipkey = true; @@ -636,7 +639,7 @@ static int rtw_cfg80211_ap_set_encryption(struct net_device *dev, struct ieee_pa { psecuritypriv->dot118021XGrpPrivacy = _AES_; - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); } else { @@ -713,7 +716,7 @@ static int rtw_cfg80211_ap_set_encryption(struct net_device *dev, struct ieee_pa { if (strcmp(param->u.crypt.alg, "WEP") == 0) { - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); psecuritypriv->dot118021XGrpPrivacy = _WEP40_; if (param->u.crypt.key_len == 13) @@ -725,12 +728,12 @@ static int rtw_cfg80211_ap_set_encryption(struct net_device *dev, struct ieee_pa { psecuritypriv->dot118021XGrpPrivacy = _TKIP_; - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); /* DEBUG_ERR("set key length :param->u.crypt.key_len =%d\n", param->u.crypt.key_len); */ /* set mic key */ - memcpy(psecuritypriv->dot118021XGrptxmickey[param->u.crypt.idx].skey, &(param->u.crypt.key[16]), 8); - memcpy(psecuritypriv->dot118021XGrprxmickey[param->u.crypt.idx].skey, &(param->u.crypt.key[24]), 8); + memcpy(txkey, &(param->u.crypt.key[16]), 8); + memcpy(rxkey, &(param->u.crypt.key[24]), 8); psecuritypriv->busetkipkey = true; @@ -739,7 +742,7 @@ static int rtw_cfg80211_ap_set_encryption(struct net_device *dev, struct ieee_pa { psecuritypriv->dot118021XGrpPrivacy = _AES_; - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); } else { diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c index e98e5388d5c7b..5088c3731b6df 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c @@ -2963,6 +2963,9 @@ static int rtw_set_encryption(struct net_device *dev, struct ieee_param *param, struct mlme_priv *pmlmepriv = &padapter->mlmepriv; struct security_priv *psecuritypriv = &(padapter->securitypriv); struct sta_priv *pstapriv = &padapter->stapriv; + char *txkey = padapter->securitypriv.dot118021XGrptxmickey[param->u.crypt.idx].skey; + char *rxkey = padapter->securitypriv.dot118021XGrprxmickey[param->u.crypt.idx].skey; + char *grpkey = psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey; param->u.crypt.err = 0; param->u.crypt.alg[IEEE_CRYPT_ALG_NAME_LEN - 1] = '\0'; @@ -3064,7 +3067,7 @@ static int rtw_set_encryption(struct net_device *dev, struct ieee_param *param, if (!psta && check_fwstate(pmlmepriv, WIFI_AP_STATE)) { /* group key */ if (param->u.crypt.set_tx == 1) { if (strcmp(param->u.crypt.alg, "WEP") == 0) { - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); psecuritypriv->dot118021XGrpPrivacy = _WEP40_; if (param->u.crypt.key_len == 13) @@ -3073,11 +3076,11 @@ static int rtw_set_encryption(struct net_device *dev, struct ieee_param *param, } else if (strcmp(param->u.crypt.alg, "TKIP") == 0) { psecuritypriv->dot118021XGrpPrivacy = _TKIP_; - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); /* DEBUG_ERR("set key length :param->u.crypt.key_len =%d\n", param->u.crypt.key_len); */ /* set mic key */ - memcpy(psecuritypriv->dot118021XGrptxmickey[param->u.crypt.idx].skey, &(param->u.crypt.key[16]), 8); + memcpy(txkey, &(param->u.crypt.key[16]), 8); memcpy(psecuritypriv->dot118021XGrprxmickey[param->u.crypt.idx].skey, &(param->u.crypt.key[24]), 8); psecuritypriv->busetkipkey = true; @@ -3086,7 +3089,7 @@ static int rtw_set_encryption(struct net_device *dev, struct ieee_param *param, else if (strcmp(param->u.crypt.alg, "CCMP") == 0) { psecuritypriv->dot118021XGrpPrivacy = _AES_; - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); } else { psecuritypriv->dot118021XGrpPrivacy = _NO_PRIVACY_; } @@ -3142,7 +3145,7 @@ static int rtw_set_encryption(struct net_device *dev, struct ieee_param *param, } else { /* group key??? */ if (strcmp(param->u.crypt.alg, "WEP") == 0) { - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); psecuritypriv->dot118021XGrpPrivacy = _WEP40_; if (param->u.crypt.key_len == 13) @@ -3150,19 +3153,19 @@ static int rtw_set_encryption(struct net_device *dev, struct ieee_param *param, } else if (strcmp(param->u.crypt.alg, "TKIP") == 0) { psecuritypriv->dot118021XGrpPrivacy = _TKIP_; - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); /* DEBUG_ERR("set key length :param->u.crypt.key_len =%d\n", param->u.crypt.key_len); */ /* set mic key */ - memcpy(psecuritypriv->dot118021XGrptxmickey[param->u.crypt.idx].skey, &(param->u.crypt.key[16]), 8); - memcpy(psecuritypriv->dot118021XGrprxmickey[param->u.crypt.idx].skey, &(param->u.crypt.key[24]), 8); + memcpy(txkey, &(param->u.crypt.key[16]), 8); + memcpy(rxkey, &(param->u.crypt.key[24]), 8); psecuritypriv->busetkipkey = true; } else if (strcmp(param->u.crypt.alg, "CCMP") == 0) { psecuritypriv->dot118021XGrpPrivacy = _AES_; - memcpy(psecuritypriv->dot118021XGrpKey[param->u.crypt.idx].skey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); + memcpy(grpkey, param->u.crypt.key, (param->u.crypt.key_len > 16 ? 16 : param->u.crypt.key_len)); } else { psecuritypriv->dot118021XGrpPrivacy = _NO_PRIVACY_; } From 18abf874367456540846319574864e6ff32752e2 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 26 Apr 2021 11:26:22 +0200 Subject: [PATCH 095/263] cdc-wdm: untangle a circular dependency between callback and softint We have a cycle of callbacks scheduling works which submit URBs with those callbacks. This needs to be blocked, stopped and unblocked to untangle the circle. Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20210426092622.20433-1-oneukum@suse.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 508b1c3f8b731..d1e4a7379bebd 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -321,12 +321,23 @@ static void wdm_int_callback(struct urb *urb) } -static void kill_urbs(struct wdm_device *desc) +static void poison_urbs(struct wdm_device *desc) { /* the order here is essential */ - usb_kill_urb(desc->command); - usb_kill_urb(desc->validity); - usb_kill_urb(desc->response); + usb_poison_urb(desc->command); + usb_poison_urb(desc->validity); + usb_poison_urb(desc->response); +} + +static void unpoison_urbs(struct wdm_device *desc) +{ + /* + * the order here is not essential + * it is symmetrical just to be nice + */ + usb_unpoison_urb(desc->response); + usb_unpoison_urb(desc->validity); + usb_unpoison_urb(desc->command); } static void free_urbs(struct wdm_device *desc) @@ -741,11 +752,12 @@ static int wdm_release(struct inode *inode, struct file *file) if (!desc->count) { if (!test_bit(WDM_DISCONNECTING, &desc->flags)) { dev_dbg(&desc->intf->dev, "wdm_release: cleanup\n"); - kill_urbs(desc); + poison_urbs(desc); spin_lock_irq(&desc->iuspin); desc->resp_count = 0; spin_unlock_irq(&desc->iuspin); desc->manage_power(desc->intf, 0); + unpoison_urbs(desc); } else { /* must avoid dev_printk here as desc->intf is invalid */ pr_debug(KBUILD_MODNAME " %s: device gone - cleaning up\n", __func__); @@ -1037,9 +1049,9 @@ static void wdm_disconnect(struct usb_interface *intf) wake_up_all(&desc->wait); mutex_lock(&desc->rlock); mutex_lock(&desc->wlock); + poison_urbs(desc); cancel_work_sync(&desc->rxwork); cancel_work_sync(&desc->service_outs_intr); - kill_urbs(desc); mutex_unlock(&desc->wlock); mutex_unlock(&desc->rlock); @@ -1080,9 +1092,10 @@ static int wdm_suspend(struct usb_interface *intf, pm_message_t message) set_bit(WDM_SUSPENDING, &desc->flags); spin_unlock_irq(&desc->iuspin); /* callback submits work - order is essential */ - kill_urbs(desc); + poison_urbs(desc); cancel_work_sync(&desc->rxwork); cancel_work_sync(&desc->service_outs_intr); + unpoison_urbs(desc); } if (!PMSG_IS_AUTO(message)) { mutex_unlock(&desc->wlock); @@ -1140,7 +1153,7 @@ static int wdm_pre_reset(struct usb_interface *intf) wake_up_all(&desc->wait); mutex_lock(&desc->rlock); mutex_lock(&desc->wlock); - kill_urbs(desc); + poison_urbs(desc); cancel_work_sync(&desc->rxwork); cancel_work_sync(&desc->service_outs_intr); return 0; @@ -1151,6 +1164,7 @@ static int wdm_post_reset(struct usb_interface *intf) struct wdm_device *desc = wdm_find_device(intf); int rv; + unpoison_urbs(desc); clear_bit(WDM_OVERFLOW, &desc->flags); clear_bit(WDM_RESETTING, &desc->flags); rv = recover_from_urb_loss(desc); From 04357fafea9c7ed34525eb9680c760245c3bb958 Mon Sep 17 00:00:00 2001 From: Ferry Toth Date: Sun, 25 Apr 2021 17:09:47 +0200 Subject: [PATCH 096/263] usb: dwc3: pci: Enable usb2-gadget-lpm-disable for Intel Merrifield On Intel Merrifield LPM is causing host to reset port after a timeout. By disabling LPM entirely this is prevented. Fixes: 066c09593454 ("usb: dwc3: pci: Enable extcon driver for Intel Merrifield") Reviewed-by: Andy Shevchenko Signed-off-by: Ferry Toth Cc: stable Link: https://lore.kernel.org/r/20210425150947.5862-1-ftoth@exalondelft.nl Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index e7b932dcbf820..1e51460938b83 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -123,6 +123,7 @@ static const struct property_entry dwc3_pci_mrfld_properties[] = { PROPERTY_ENTRY_STRING("linux,extcon-name", "mrfld_bcove_pwrsrc"), PROPERTY_ENTRY_BOOL("snps,dis_u3_susphy_quirk"), PROPERTY_ENTRY_BOOL("snps,dis_u2_susphy_quirk"), + PROPERTY_ENTRY_BOOL("snps,usb2-gadget-lpm-disable"), PROPERTY_ENTRY_BOOL("linux,sysdev_is_parent"), {} }; From 9cbc7eb17cdf6d1adaa2aebfe0079077d31d39a9 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Mon, 26 Apr 2021 14:08:40 -0700 Subject: [PATCH 097/263] usb: dwc3: core: Add missing GHWPARAMS9 doc Add missing documentation for struct dwc3_hwparams new field hwparams9 to avoid kernel doc build warning. Fixes: 16710380d3aa ("usb: dwc3: Capture new capability register GHWPARAMS9") Reported-by: Stephen Rothwell Acked-by: Felipe Balbi Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/f4c491f7614e623755fafe640b7e690e7c5634e2.1619471127.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index b1e875c58f20f..3859d8cad3cb4 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -850,6 +850,7 @@ struct dwc3_trb { * @hwparams6: GHWPARAMS6 * @hwparams7: GHWPARAMS7 * @hwparams8: GHWPARAMS8 + * @hwparams9: GHWPARAMS9 */ struct dwc3_hwparams { u32 hwparams0; From 6c05cdbb9ef1de0264cac9135f6e90dad1e8763f Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sun, 25 Apr 2021 12:32:53 -0300 Subject: [PATCH 098/263] usb: Restore the reference to ch9.h Keep the textual reference to ch9.h as it was prior to commit caa93d9bd2d7 ("usb: Fix up movement of USB core kerneldoc location"). As linux/usb/ch9.h does not contain comments anymore, explain that drivers/usb/common/common.c includes such header and provides declarations of a few utilities routines for manipulating the data types from ch9.h. Also mention that drivers/usb/common/debug.c contains some functions for creating debug output. Fixes: caa93d9bd2d7 ("usb: Fix up movement of USB core kerneldoc location") Reported-by: Alan Stern Suggested-by: Alan Stern Acked-by: Alan Stern Signed-off-by: Fabio Estevam Link: https://lore.kernel.org/r/20210425153253.2542816-1-festevam@gmail.com Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/usb/usb.rst | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Documentation/driver-api/usb/usb.rst b/Documentation/driver-api/usb/usb.rst index 543e70434da22..820e867af45ab 100644 --- a/Documentation/driver-api/usb/usb.rst +++ b/Documentation/driver-api/usb/usb.rst @@ -109,16 +109,19 @@ well as to make sure they aren't relying on some HCD-specific behavior. USB-Standard Types ================== -In ``drivers/usb/common/common.c`` and ``drivers/usb/common/debug.c`` you -will find the USB data types defined in chapter 9 of the USB specification. -These data types are used throughout USB, and in APIs including this host -side API, gadget APIs, usb character devices and debugfs interfaces. +In ``include/uapi/linux/usb/ch9.h`` you will find the USB data types defined +in chapter 9 of the USB specification. These data types are used throughout +USB, and in APIs including this host side API, gadget APIs, usb character +devices and debugfs interfaces. That file is itself included by +``include/linux/usb/ch9.h``, which also contains declarations of a few +utility routines for manipulating these data types; the implementations +are in ``drivers/usb/common/common.c``. .. kernel-doc:: drivers/usb/common/common.c :export: -.. kernel-doc:: drivers/usb/common/debug.c - :export: +In addition, some functions useful for creating debugging output are +defined in ``drivers/usb/common/debug.c``. Host-Side Data Types and Macros =============================== From d1d90dd27254c44d087ad3f8b5b3e4fff0571f45 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Wed, 28 Apr 2021 02:01:10 -0700 Subject: [PATCH 099/263] usb: dwc3: gadget: Enable suspend events commit 72704f876f50 ("dwc3: gadget: Implement the suspend entry event handler") introduced (nearly 5 years ago!) an interrupt handler for U3/L1-L2 suspend events. The problem is that these events aren't currently enabled in the DEVTEN register so the handler is never even invoked. Fix this simply by enabling the corresponding bit in dwc3_gadget_enable_irq() using the same revision check as found in the handler. Fixes: 72704f876f50 ("dwc3: gadget: Implement the suspend entry event handler") Acked-by: Felipe Balbi Signed-off-by: Jack Pham Cc: stable Link: https://lore.kernel.org/r/20210428090111.3370-1-jackp@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index dd80e5ca8c78b..cab3a91840689 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2323,6 +2323,10 @@ static void dwc3_gadget_enable_irq(struct dwc3 *dwc) if (DWC3_VER_IS_PRIOR(DWC3, 250A)) reg |= DWC3_DEVTEN_ULSTCNGEN; + /* On 2.30a and above this bit enables U3/L2-L1 Suspend Events */ + if (!DWC3_VER_IS_PRIOR(DWC3, 230A)) + reg |= DWC3_DEVTEN_EOPFEN; + dwc3_writel(dwc->regs, DWC3_DEVTEN, reg); } From 6f26ebb79a84bcad211cb2d8a2ef74dfc427322d Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Wed, 28 Apr 2021 02:01:11 -0700 Subject: [PATCH 100/263] usb: dwc3: gadget: Rename EOPF event macros to Suspend The device event corresponding to End of Periodic Frame is only found on older IP revisions (2.10a and prior, according to a cursory SNPS databook search). On revisions 2.30a and newer, including DWC3.1, the same event value and corresponding DEVTEN bit were repurposed to indicate that the link has gone into suspend state (U3 or L2/L1). EOPF events had never been enabled before in this driver, and going forward we expect current and future DWC3-based devices won't likely to be using such old DWC3 IP revisions either. Hence rather than keeping the deprecated EOPF macro names let's rename them to indicate their usage for suspend events. Acked-by: Felipe Balbi Signed-off-by: Jack Pham Link: https://lore.kernel.org/r/20210428090111.3370-2-jackp@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.h | 6 +++--- drivers/usb/dwc3/debug.h | 8 ++++---- drivers/usb/dwc3/gadget.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 3859d8cad3cb4..c5d5760cdf53e 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -57,7 +57,7 @@ #define DWC3_DEVICE_EVENT_LINK_STATUS_CHANGE 3 #define DWC3_DEVICE_EVENT_WAKEUP 4 #define DWC3_DEVICE_EVENT_HIBER_REQ 5 -#define DWC3_DEVICE_EVENT_EOPF 6 +#define DWC3_DEVICE_EVENT_SUSPEND 6 #define DWC3_DEVICE_EVENT_SOF 7 #define DWC3_DEVICE_EVENT_ERRATIC_ERROR 9 #define DWC3_DEVICE_EVENT_CMD_CMPL 10 @@ -460,7 +460,7 @@ #define DWC3_DEVTEN_CMDCMPLTEN BIT(10) #define DWC3_DEVTEN_ERRTICERREN BIT(9) #define DWC3_DEVTEN_SOFEN BIT(7) -#define DWC3_DEVTEN_EOPFEN BIT(6) +#define DWC3_DEVTEN_U3L2L1SUSPEN BIT(6) #define DWC3_DEVTEN_HIBERNATIONREQEVTEN BIT(5) #define DWC3_DEVTEN_WKUPEVTEN BIT(4) #define DWC3_DEVTEN_ULSTCNGEN BIT(3) @@ -1375,7 +1375,7 @@ struct dwc3_event_depevt { * 3 - ULStChng * 4 - WkUpEvt * 5 - Reserved - * 6 - EOPF + * 6 - Suspend (EOPF on revisions 2.10a and prior) * 7 - SOF * 8 - Reserved * 9 - ErrticErr diff --git a/drivers/usb/dwc3/debug.h b/drivers/usb/dwc3/debug.h index db231de46bb35..d0ac89c5b3172 100644 --- a/drivers/usb/dwc3/debug.h +++ b/drivers/usb/dwc3/debug.h @@ -221,8 +221,8 @@ static inline const char *dwc3_gadget_event_string(char *str, size_t size, snprintf(str, size, "WakeUp [%s]", dwc3_gadget_link_string(state)); break; - case DWC3_DEVICE_EVENT_EOPF: - snprintf(str, size, "End-Of-Frame [%s]", + case DWC3_DEVICE_EVENT_SUSPEND: + snprintf(str, size, "Suspend [%s]", dwc3_gadget_link_string(state)); break; case DWC3_DEVICE_EVENT_SOF: @@ -353,8 +353,8 @@ static inline const char *dwc3_gadget_event_type_string(u8 event) return "Wake-Up"; case DWC3_DEVICE_EVENT_HIBER_REQ: return "Hibernation"; - case DWC3_DEVICE_EVENT_EOPF: - return "End of Periodic Frame"; + case DWC3_DEVICE_EVENT_SUSPEND: + return "Suspend"; case DWC3_DEVICE_EVENT_SOF: return "Start of Frame"; case DWC3_DEVICE_EVENT_ERRATIC_ERROR: diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index cab3a91840689..6eab78f8a1a78 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2325,7 +2325,7 @@ static void dwc3_gadget_enable_irq(struct dwc3 *dwc) /* On 2.30a and above this bit enables U3/L2-L1 Suspend Events */ if (!DWC3_VER_IS_PRIOR(DWC3, 230A)) - reg |= DWC3_DEVTEN_EOPFEN; + reg |= DWC3_DEVTEN_U3L2L1SUSPEN; dwc3_writel(dwc->regs, DWC3_DEVTEN, reg); } @@ -3744,7 +3744,7 @@ static void dwc3_gadget_interrupt(struct dwc3 *dwc, case DWC3_DEVICE_EVENT_LINK_STATUS_CHANGE: dwc3_gadget_linksts_change_interrupt(dwc, event->event_info); break; - case DWC3_DEVICE_EVENT_EOPF: + case DWC3_DEVICE_EVENT_SUSPEND: /* It changed to be suspend event for version 2.30a and above */ if (!DWC3_VER_IS_PRIOR(DWC3, 230A)) { /* From 75a41ce46bae6cbe7d3bb2584eb844291d642874 Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Thu, 6 May 2021 12:22:00 +0100 Subject: [PATCH 101/263] usb: dwc2: Fix gadget DMA unmap direction The dwc2 gadget support maps and unmaps DMA buffers as necessary. When mapping and unmapping it uses the direction of the endpoint to select the direction of the DMA transfer, but this fails for Control OUT transfers because the unmap occurs after the endpoint direction has been reversed for the status phase. A possible solution would be to unmap the buffer before the direction is changed, but a safer, less invasive fix is to remember the buffer direction independently of the endpoint direction. Fixes: fe0b94abcdf6 ("usb: dwc2: gadget: manage ep0 state in software") Acked-by: Minas Harutyunyan Cc: stable Signed-off-by: Phil Elwell Link: https://lore.kernel.org/r/20210506112200.2893922-1-phil@raspberrypi.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/core.h | 2 ++ drivers/usb/dwc2/gadget.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc2/core.h b/drivers/usb/dwc2/core.h index da5ac4a4595b6..ab6b815e0089c 100644 --- a/drivers/usb/dwc2/core.h +++ b/drivers/usb/dwc2/core.h @@ -113,6 +113,7 @@ struct dwc2_hsotg_req; * @debugfs: File entry for debugfs file for this endpoint. * @dir_in: Set to true if this endpoint is of the IN direction, which * means that it is sending data to the Host. + * @map_dir: Set to the value of dir_in when the DMA buffer is mapped. * @index: The index for the endpoint registers. * @mc: Multi Count - number of transactions per microframe * @interval: Interval for periodic endpoints, in frames or microframes. @@ -162,6 +163,7 @@ struct dwc2_hsotg_ep { unsigned short fifo_index; unsigned char dir_in; + unsigned char map_dir; unsigned char index; unsigned char mc; u16 interval; diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index e6bb1bdb27603..184964174dc0c 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -422,7 +422,7 @@ static void dwc2_hsotg_unmap_dma(struct dwc2_hsotg *hsotg, { struct usb_request *req = &hs_req->req; - usb_gadget_unmap_request(&hsotg->gadget, req, hs_ep->dir_in); + usb_gadget_unmap_request(&hsotg->gadget, req, hs_ep->map_dir); } /* @@ -1242,6 +1242,7 @@ static int dwc2_hsotg_map_dma(struct dwc2_hsotg *hsotg, { int ret; + hs_ep->map_dir = hs_ep->dir_in; ret = usb_gadget_map_request(&hsotg->gadget, req, hs_ep->dir_in); if (ret) goto dma_error; From bb9c74a5bd1462499fe5ccb1e3c5ac40dcfa9139 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Sat, 1 May 2021 02:35:58 -0700 Subject: [PATCH 102/263] usb: dwc3: gadget: Free gadget structure only after freeing endpoints As part of commit e81a7018d93a ("usb: dwc3: allocate gadget structure dynamically") the dwc3_gadget_release() was added which will free the dwc->gadget structure upon the device's removal when usb_del_gadget_udc() is called in dwc3_gadget_exit(). However, simply freeing the gadget results a dangling pointer situation: the endpoints created in dwc3_gadget_init_endpoints() have their dep->endpoint.ep_list members chained off the list_head anchored at dwc->gadget->ep_list. Thus when dwc->gadget is freed, the first dwc3_ep in the list now has a dangling prev pointer and likewise for the next pointer of the dwc3_ep at the tail of the list. The dwc3_gadget_free_endpoints() that follows will result in a use-after-free when it calls list_del(). This was caught by enabling KASAN and performing a driver unbind. The recent commit 568262bf5492 ("usb: dwc3: core: Add shutdown callback for dwc3") also exposes this as a panic during shutdown. There are a few possibilities to fix this. One could be to perform a list_del() of the gadget->ep_list itself which removes it from the rest of the dwc3_ep chain. Another approach is what this patch does, by splitting up the usb_del_gadget_udc() call into its separate "del" and "put" components. This allows dwc3_gadget_free_endpoints() to be called before the gadget is finally freed with usb_put_gadget(). Fixes: e81a7018d93a ("usb: dwc3: allocate gadget structure dynamically") Reviewed-by: Peter Chen Signed-off-by: Jack Pham Link: https://lore.kernel.org/r/20210501093558.7375-1-jackp@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6eab78f8a1a78..dd1342403bb24 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -4062,8 +4062,9 @@ int dwc3_gadget_init(struct dwc3 *dwc) void dwc3_gadget_exit(struct dwc3 *dwc) { - usb_del_gadget_udc(dwc->gadget); + usb_del_gadget(dwc->gadget); dwc3_gadget_free_endpoints(dwc); + usb_put_gadget(dwc->gadget); dma_free_coherent(dwc->sysdev, DWC3_BOUNCE_SIZE, dwc->bounce, dwc->bounce_addr); kfree(dwc->setup_buf); From 18ffa988dbae69cc6e9949cddd9606f6fe533894 Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Fri, 7 May 2021 10:55:19 -0700 Subject: [PATCH 103/263] usb: dwc3: gadget: Return success always for kick transfer in ep queue If an error is received when issuing a start or update transfer command, the error handler will stop all active requests (including the current USB request), and call dwc3_gadget_giveback() to notify function drivers of the requests which have been stopped. Avoid returning an error for kick transfer during EP queue, to remove duplicate cleanup operations on the request being queued. Fixes: 8d99087c2db8 ("usb: dwc3: gadget: Properly handle failed kick_transfer") cc: stable@vger.kernel.org Signed-off-by: Wesley Cheng Link: https://lore.kernel.org/r/1620410119-24971-1-git-send-email-wcheng@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index dd1342403bb24..49ca5da5e2794 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1684,7 +1684,9 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep *dep, struct dwc3_request *req) } } - return __dwc3_gadget_kick_transfer(dep); + __dwc3_gadget_kick_transfer(dep); + + return 0; } static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, From b96992081fde19806b5beb5b25f9327820ead77b Mon Sep 17 00:00:00 2001 From: Li Jun Date: Fri, 30 Apr 2021 14:57:16 +0800 Subject: [PATCH 104/263] usb: dwc3: imx8mp: detect dwc3 core node via compatible string New schema of usb controller DT-node should be named with prefix "^usb(@.*)?", dt changed the node name, but missed counter part change in driver, fix it by switching to use compatible string as the dwc3 core compatible string keeps "snps,dwc3" in all dt. Fixes: d1689cd3c0f4 ("arm64: dts: imx8mp: Use the correct name for child node "snps, dwc3"") Acked-by: Felipe Balbi Signed-off-by: Li Jun Link: https://lore.kernel.org/r/1619765836-20387-1-git-send-email-jun.li@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-imx8mp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-imx8mp.c b/drivers/usb/dwc3/dwc3-imx8mp.c index b13cfab89d532..e9fced6f7a7c9 100644 --- a/drivers/usb/dwc3/dwc3-imx8mp.c +++ b/drivers/usb/dwc3/dwc3-imx8mp.c @@ -165,7 +165,7 @@ static int dwc3_imx8mp_probe(struct platform_device *pdev) if (err < 0) goto disable_rpm; - dwc3_np = of_get_child_by_name(node, "dwc3"); + dwc3_np = of_get_compatible_child(node, "snps,dwc3"); if (!dwc3_np) { dev_err(dev, "failed to find dwc3 core child\n"); goto disable_rpm; From 0b2b149e918f6dddb4ea53615551bf7bc131f875 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 8 May 2021 09:53:10 +0800 Subject: [PATCH 105/263] usb: dwc3: imx8mp: fix error return code in dwc3_imx8mp_probe() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: 6dd2565989b4 ("usb: dwc3: add imx8mp dwc3 glue layer driver") Reported-by: Hulk Robot Acked-by: Felipe Balbi Signed-off-by: Zhen Lei Cc: stable Link: https://lore.kernel.org/r/20210508015310.1627-1-thunder.leizhen@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-imx8mp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/dwc3-imx8mp.c b/drivers/usb/dwc3/dwc3-imx8mp.c index e9fced6f7a7c9..756faa46d33a7 100644 --- a/drivers/usb/dwc3/dwc3-imx8mp.c +++ b/drivers/usb/dwc3/dwc3-imx8mp.c @@ -167,6 +167,7 @@ static int dwc3_imx8mp_probe(struct platform_device *pdev) dwc3_np = of_get_compatible_child(node, "snps,dwc3"); if (!dwc3_np) { + err = -ENODEV; dev_err(dev, "failed to find dwc3 core child\n"); goto disable_rpm; } From e89baeba4f64bab679618b3330cdcda5929fb8d5 Mon Sep 17 00:00:00 2001 From: Matthijs Kooijman Date: Mon, 3 May 2021 20:05:38 +0200 Subject: [PATCH 106/263] usb: dwc2: Remove obsolete MODULE_ constants from platform.c Originally, the core and platform drivers were separate modules, so each had its own module info. Since commit 2d1165a4b95e (usb: dwc2: remove dwc2_platform.ko) platform.c is included in the core module, which now contains duplicate module info (from core.c and platform.c). Due to the linking order and modinfo implementation, running `modinfo` on the resulting dwc2.ko shows just the info from platform.c, rather than that from core.c, suggesting that I am the author of the entire dwc2 module. Since platform.c is just a minor part of the entire module, this removes its module info in favor of the info from core.c. Acked-by: Minas Harutyunyan Signed-off-by: Matthijs Kooijman Link: https://lore.kernel.org/r/20210503180538.64423-1-matthijs@stdin.nl Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/platform.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/usb/dwc2/platform.c b/drivers/usb/dwc2/platform.c index 3024785d84cb8..520a0beef77ca 100644 --- a/drivers/usb/dwc2/platform.c +++ b/drivers/usb/dwc2/platform.c @@ -776,7 +776,3 @@ static struct platform_driver dwc2_platform_driver = { }; module_platform_driver(dwc2_platform_driver); - -MODULE_DESCRIPTION("DESIGNWARE HS OTG Platform Glue"); -MODULE_AUTHOR("Matthijs Kooijman "); -MODULE_LICENSE("Dual BSD/GPL"); From 2e2b8d15adc2f6ab2d4aa0550e241b9742a436a0 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Tue, 4 May 2021 01:18:49 +0800 Subject: [PATCH 107/263] usb: typec: tcpm: Fix wrong handling in GET_SINK_CAP After receiving Sink Capabilities Message in GET_SINK_CAP AMS, it is incorrect to call tcpm_pd_handle_state because the Message is expected and the current state is not Ready states. The result of this incorrect operation ends in Soft Reset which is definitely wrong. Simply forwarding to Ready States is enough to finish the AMS. Fixes: 8dea75e11380 ("usb: typec: tcpm: Protocol Error handling") Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Kyle Tso Cc: stable Link: https://lore.kernel.org/r/20210503171849.2605302-1-kyletso@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index c4fdc00a3bc8f..68e04e397e924 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -2390,7 +2390,7 @@ static void tcpm_pd_data_request(struct tcpm_port *port, port->nr_sink_caps = cnt; port->sink_cap_done = true; if (port->ams == GET_SINK_CAPABILITIES) - tcpm_pd_handle_state(port, ready_state(port), NONE_AMS, 0); + tcpm_set_state(port, ready_state(port), 0); /* Unexpected Sink Capabilities */ else tcpm_pd_handle_msg(port, From 8edb79af88efc6e49e735f9baf61d9f0748b881f Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 7 Apr 2021 11:49:27 +0800 Subject: [PATCH 108/263] iio: light: gp2ap002: Fix rumtime PM imbalance on error When devm_request_threaded_irq() fails, we should decrease the runtime PM counter to keep the counter balanced. But when iio_device_register() fails, we need not to decrease it because we have already decreased it before. Signed-off-by: Dinghao Liu Reviewed-by: Linus Walleij Fixes: 97d642e23037 ("iio: light: Add a driver for Sharp GP2AP002x00F") Link: https://lore.kernel.org/r/20210407034927.16882-1-dinghao.liu@zju.edu.cn Signed-off-by: Jonathan Cameron --- drivers/iio/light/gp2ap002.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iio/light/gp2ap002.c b/drivers/iio/light/gp2ap002.c index d048ae257c519..f960be7d40016 100644 --- a/drivers/iio/light/gp2ap002.c +++ b/drivers/iio/light/gp2ap002.c @@ -582,7 +582,7 @@ static int gp2ap002_probe(struct i2c_client *client, "gp2ap002", indio_dev); if (ret) { dev_err(dev, "unable to request IRQ\n"); - goto out_disable_vio; + goto out_put_pm; } gp2ap002->irq = client->irq; @@ -612,8 +612,9 @@ static int gp2ap002_probe(struct i2c_client *client, return 0; -out_disable_pm: +out_put_pm: pm_runtime_put_noidle(dev); +out_disable_pm: pm_runtime_disable(dev); out_disable_vio: regulator_disable(gp2ap002->vio); From a2fa9242e89f27696515699fe0f0296bf1ac1815 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Mon, 12 Apr 2021 13:32:02 +0800 Subject: [PATCH 109/263] iio: proximity: pulsedlight: Fix rumtime PM imbalance on error When lidar_write_control() fails, a pairing PM usage counter decrement is needed to keep the counter balanced. Fixes: 4ac4e086fd8c5 ("iio: pulsedlight-lidar-lite: add runtime PM") Signed-off-by: Dinghao Liu Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210412053204.4889-1-dinghao.liu@zju.edu.cn Signed-off-by: Jonathan Cameron --- drivers/iio/proximity/pulsedlight-lidar-lite-v2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/proximity/pulsedlight-lidar-lite-v2.c b/drivers/iio/proximity/pulsedlight-lidar-lite-v2.c index c685f10b5ae48..cc206bfa09c78 100644 --- a/drivers/iio/proximity/pulsedlight-lidar-lite-v2.c +++ b/drivers/iio/proximity/pulsedlight-lidar-lite-v2.c @@ -160,6 +160,7 @@ static int lidar_get_measurement(struct lidar_data *data, u16 *reg) ret = lidar_write_control(data, LIDAR_REG_CONTROL_ACQUIRE); if (ret < 0) { dev_err(&client->dev, "cannot send start measurement command"); + pm_runtime_put_noidle(&client->dev); return ret; } From 7061803522ee7876df1ca18cdd1e1551f761352d Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Wed, 14 Apr 2021 11:49:55 +0300 Subject: [PATCH 110/263] iio: hid-sensors: select IIO_TRIGGERED_BUFFER under HID_SENSOR_IIO_TRIGGER During commit 067fda1c065ff ("iio: hid-sensors: move triggered buffer setup into hid_sensor_setup_trigger"), the iio_triggered_buffer_{setup,cleanup}() functions got moved under the hid-sensor-trigger module. The above change works fine, if any of the sensors get built. However, when only the common hid-sensor-trigger module gets built (and none of the drivers), then the IIO_TRIGGERED_BUFFER symbol isn't selected/enforced. Previously, each driver would enforce/select the IIO_TRIGGERED_BUFFER symbol. With this change the HID_SENSOR_IIO_TRIGGER (for the hid-sensor-trigger module) will enforce that IIO_TRIGGERED_BUFFER gets selected. All HID sensor drivers select the HID_SENSOR_IIO_TRIGGER symbol. So, this change removes the IIO_TRIGGERED_BUFFER enforcement from each driver. Fixes: 067fda1c065ff ("iio: hid-sensors: move triggered buffer setup into hid_sensor_setup_trigger") Reported-by: Thomas Deutschmann Cc: Srinivas Pandruvada Signed-off-by: Alexandru Ardelean Acked-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20210414084955.260117-1-aardelean@deviqon.com Signed-off-by: Jonathan Cameron --- drivers/iio/accel/Kconfig | 1 - drivers/iio/common/hid-sensors/Kconfig | 1 + drivers/iio/gyro/Kconfig | 1 - drivers/iio/humidity/Kconfig | 1 - drivers/iio/light/Kconfig | 2 -- drivers/iio/magnetometer/Kconfig | 1 - drivers/iio/orientation/Kconfig | 2 -- drivers/iio/pressure/Kconfig | 1 - drivers/iio/temperature/Kconfig | 1 - 9 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/iio/accel/Kconfig b/drivers/iio/accel/Kconfig index cceda3cecbcf4..8b1723635cce5 100644 --- a/drivers/iio/accel/Kconfig +++ b/drivers/iio/accel/Kconfig @@ -229,7 +229,6 @@ config DMARD10 config HID_SENSOR_ACCEL_3D depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER tristate "HID Accelerometers 3D" diff --git a/drivers/iio/common/hid-sensors/Kconfig b/drivers/iio/common/hid-sensors/Kconfig index 24d4925673363..2a3dd3b907bee 100644 --- a/drivers/iio/common/hid-sensors/Kconfig +++ b/drivers/iio/common/hid-sensors/Kconfig @@ -19,6 +19,7 @@ config HID_SENSOR_IIO_TRIGGER tristate "Common module (trigger) for all HID Sensor IIO drivers" depends on HID_SENSOR_HUB && HID_SENSOR_IIO_COMMON && IIO_BUFFER select IIO_TRIGGER + select IIO_TRIGGERED_BUFFER help Say yes here to build trigger support for HID sensors. Triggers will be send if all requested attributes were read. diff --git a/drivers/iio/gyro/Kconfig b/drivers/iio/gyro/Kconfig index 5824f2edf9758..20b5ac7ab66af 100644 --- a/drivers/iio/gyro/Kconfig +++ b/drivers/iio/gyro/Kconfig @@ -111,7 +111,6 @@ config FXAS21002C_SPI config HID_SENSOR_GYRO_3D depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER tristate "HID Gyroscope 3D" diff --git a/drivers/iio/humidity/Kconfig b/drivers/iio/humidity/Kconfig index 6549fcf6db698..2de5494e7c225 100644 --- a/drivers/iio/humidity/Kconfig +++ b/drivers/iio/humidity/Kconfig @@ -52,7 +52,6 @@ config HID_SENSOR_HUMIDITY tristate "HID Environmental humidity sensor" depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER help diff --git a/drivers/iio/light/Kconfig b/drivers/iio/light/Kconfig index 33ad4dd0b5c7b..917f9becf9c75 100644 --- a/drivers/iio/light/Kconfig +++ b/drivers/iio/light/Kconfig @@ -256,7 +256,6 @@ config ISL29125 config HID_SENSOR_ALS depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER tristate "HID ALS" @@ -270,7 +269,6 @@ config HID_SENSOR_ALS config HID_SENSOR_PROX depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER tristate "HID PROX" diff --git a/drivers/iio/magnetometer/Kconfig b/drivers/iio/magnetometer/Kconfig index 5d4ffd66032e9..74ad5701c6c29 100644 --- a/drivers/iio/magnetometer/Kconfig +++ b/drivers/iio/magnetometer/Kconfig @@ -95,7 +95,6 @@ config MAG3110 config HID_SENSOR_MAGNETOMETER_3D depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER tristate "HID Magenetometer 3D" diff --git a/drivers/iio/orientation/Kconfig b/drivers/iio/orientation/Kconfig index a505583cc2fda..396cbbb867f4c 100644 --- a/drivers/iio/orientation/Kconfig +++ b/drivers/iio/orientation/Kconfig @@ -9,7 +9,6 @@ menu "Inclinometer sensors" config HID_SENSOR_INCLINOMETER_3D depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER tristate "HID Inclinometer 3D" @@ -20,7 +19,6 @@ config HID_SENSOR_INCLINOMETER_3D config HID_SENSOR_DEVICE_ROTATION depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER tristate "HID Device Rotation" diff --git a/drivers/iio/pressure/Kconfig b/drivers/iio/pressure/Kconfig index 689b978db4f95..fc0d3cfca4186 100644 --- a/drivers/iio/pressure/Kconfig +++ b/drivers/iio/pressure/Kconfig @@ -79,7 +79,6 @@ config DPS310 config HID_SENSOR_PRESS depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER tristate "HID PRESS" diff --git a/drivers/iio/temperature/Kconfig b/drivers/iio/temperature/Kconfig index f1f2a1499c9e2..4df60082c1fa8 100644 --- a/drivers/iio/temperature/Kconfig +++ b/drivers/iio/temperature/Kconfig @@ -45,7 +45,6 @@ config HID_SENSOR_TEMP tristate "HID Environmental temperature sensor" depends on HID_SENSOR_HUB select IIO_BUFFER - select IIO_TRIGGERED_BUFFER select HID_SENSOR_IIO_COMMON select HID_SENSOR_IIO_TRIGGER help From f73c730774d88a14d7b60feee6d0e13570f99499 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 23 Apr 2021 05:09:59 +0300 Subject: [PATCH 111/263] iio: gyro: mpu3050: Fix reported temperature value The raw temperature value is a 16-bit signed integer. The sign casting is missing in the code, which results in a wrong temperature reported by userspace tools, fix it. Cc: stable@vger.kernel.org Fixes: 3904b28efb2c ("iio: gyro: Add driver for the MPU-3050 gyroscope") Datasheet: https://www.cdiweb.com/datasheets/invensense/mpu-3000a.pdf Tested-by: Maxim Schwalm # Asus TF700T Tested-by: Svyatoslav Ryhel # Asus TF201 Reported-by: Svyatoslav Ryhel Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Dmitry Osipenko Acked-by: Jean-Baptiste Maneyrol Link: https://lore.kernel.org/r/20210423020959.5023-1-digetx@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/gyro/mpu3050-core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/iio/gyro/mpu3050-core.c b/drivers/iio/gyro/mpu3050-core.c index ac90be03332af..f17a935195352 100644 --- a/drivers/iio/gyro/mpu3050-core.c +++ b/drivers/iio/gyro/mpu3050-core.c @@ -272,7 +272,16 @@ static int mpu3050_read_raw(struct iio_dev *indio_dev, case IIO_CHAN_INFO_OFFSET: switch (chan->type) { case IIO_TEMP: - /* The temperature scaling is (x+23000)/280 Celsius */ + /* + * The temperature scaling is (x+23000)/280 Celsius + * for the "best fit straight line" temperature range + * of -30C..85C. The 23000 includes room temperature + * offset of +35C, 280 is the precision scale and x is + * the 16-bit signed integer reported by hardware. + * + * Temperature value itself represents temperature of + * the sensor die. + */ *val = 23000; return IIO_VAL_INT; default: @@ -329,7 +338,7 @@ static int mpu3050_read_raw(struct iio_dev *indio_dev, goto out_read_raw_unlock; } - *val = be16_to_cpu(raw_val); + *val = (s16)be16_to_cpu(raw_val); ret = IIO_VAL_INT; goto out_read_raw_unlock; From 901f84de0e16bde10a72d7eb2f2eb73fcde8fa1a Mon Sep 17 00:00:00 2001 From: Tomasz Duszynski Date: Fri, 23 Apr 2021 10:02:44 +0200 Subject: [PATCH 112/263] iio: core: fix ioctl handlers removal Currently ioctl handlers are removed twice. For the first time during iio_device_unregister() then later on inside iio_device_unregister_eventset() and iio_buffers_free_sysfs_and_mask(). Double free leads to kernel panic. Fix this by not touching ioctl handlers list directly but rather letting code responsible for registration call the matching cleanup routine itself. Fixes: 8dedcc3eee3ac ("iio: core: centralize ioctl() calls to the main chardev") Signed-off-by: Tomasz Duszynski Acked-by: Alexandru Ardelean Cc: Link: https://lore.kernel.org/r/20210423080244.2790-1-tomasz.duszynski@octakon.com Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index d92c58a94fe4f..9e59f5da3d280 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1926,9 +1926,6 @@ EXPORT_SYMBOL(__iio_device_register); **/ void iio_device_unregister(struct iio_dev *indio_dev) { - struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); - struct iio_ioctl_handler *h, *t; - cdev_device_del(&indio_dev->chrdev, &indio_dev->dev); mutex_lock(&indio_dev->info_exist_lock); @@ -1939,9 +1936,6 @@ void iio_device_unregister(struct iio_dev *indio_dev) indio_dev->info = NULL; - list_for_each_entry_safe(h, t, &iio_dev_opaque->ioctl_handlers, entry) - list_del(&h->entry); - iio_device_wakeup_eventset(indio_dev); iio_buffer_wakeup_poll(indio_dev); From af0670b0bf1b116fd729b1b1011cf814bc34e12e Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 3 May 2021 17:43:50 +0300 Subject: [PATCH 113/263] iio: core: return ENODEV if ioctl is unknown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the ioctl() mechanism was introduced in IIO core to centralize the registration of all ioctls in one place via commit 8dedcc3eee3ac ("iio: core: centralize ioctl() calls to the main chardev"), the return code was changed from ENODEV to EINVAL, when the ioctl code isn't known. This was done by accident. This change reverts back to the old behavior, where if the ioctl() code isn't known, ENODEV is returned (vs EINVAL). This was brought into perspective by this patch: https://lore.kernel.org/linux-iio/20210428150815.136150-1-paul@crapouillou.net/ Fixes: 8dedcc3eee3ac ("iio: core: centralize ioctl() calls to the main chardev") Signed-off-by: Alexandru Ardelean Reviewed-by: Nuno Sá Tested-by: Paul Cercueil Reviewed-by: Linus Walleij Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 9e59f5da3d280..59efb36db2c7c 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1778,7 +1778,6 @@ static long iio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!indio_dev->info) goto out_unlock; - ret = -EINVAL; list_for_each_entry(h, &iio_dev_opaque->ioctl_handlers, entry) { ret = h->ioctl(indio_dev, filp, cmd, arg); if (ret != IIO_IOCTL_UNHANDLED) @@ -1786,7 +1785,7 @@ static long iio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } if (ret == IIO_IOCTL_UNHANDLED) - ret = -EINVAL; + ret = -ENODEV; out_unlock: mutex_unlock(&indio_dev->info_exist_lock); From af0e1871d79cfbb91f732d2c6fa7558e45c31038 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 7 May 2021 19:30:41 +0100 Subject: [PATCH 114/263] iio: tsl2583: Fix division by a zero lux_val The lux_val returned from tsl2583_get_lux can potentially be zero, so check for this to avoid a division by zero and an overflowed gain_trim_val. Fixes clang scan-build warning: drivers/iio/light/tsl2583.c:345:40: warning: Either the condition 'lux_val<0' is redundant or there is division by zero at line 345. [zerodivcond] Fixes: ac4f6eee8fe8 ("staging: iio: TAOS tsl258x: Device driver") Signed-off-by: Colin Ian King Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/light/tsl2583.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/iio/light/tsl2583.c b/drivers/iio/light/tsl2583.c index 0f787bfc88fc4..c9d8f07a6fcdd 100644 --- a/drivers/iio/light/tsl2583.c +++ b/drivers/iio/light/tsl2583.c @@ -341,6 +341,14 @@ static int tsl2583_als_calibrate(struct iio_dev *indio_dev) return lux_val; } + /* Avoid division by zero of lux_value later on */ + if (lux_val == 0) { + dev_err(&chip->client->dev, + "%s: lux_val of 0 will produce out of range trim_value\n", + __func__); + return -ENODATA; + } + gain_trim_val = (unsigned int)(((chip->als_settings.als_cal_target) * chip->als_settings.als_gain_trim) / lux_val); if ((gain_trim_val < 250) || (gain_trim_val > 4000)) { From b9a0866a5bdf6a4643a52872ada6be6184c6f4f2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 May 2021 01:23:37 +0300 Subject: [PATCH 115/263] usb: typec: ucsi: Put fwnode in any case during ->probe() device_for_each_child_node() bumps a reference counting of a returned variable. We have to balance it whenever we return to the caller. Fixes: c1b0bc2dabfa ("usb: typec: Add support for UCSI interface") Cc: Heikki Krogerus Reviewed-by: Heikki Krogerus Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210504222337.3151726-1-andy.shevchenko@gmail.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 282c3c825c136..0e1cec346e0f8 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -999,6 +999,7 @@ static const struct typec_operations ucsi_ops = { .pr_set = ucsi_pr_swap }; +/* Caller must call fwnode_handle_put() after use */ static struct fwnode_handle *ucsi_find_fwnode(struct ucsi_connector *con) { struct fwnode_handle *fwnode; @@ -1033,7 +1034,7 @@ static int ucsi_register_port(struct ucsi *ucsi, int index) command |= UCSI_CONNECTOR_NUMBER(con->num); ret = ucsi_send_command(ucsi, command, &con->cap, sizeof(con->cap)); if (ret < 0) - goto out; + goto out_unlock; if (con->cap.op_mode & UCSI_CONCAP_OPMODE_DRP) cap->data = TYPEC_PORT_DRD; @@ -1151,6 +1152,8 @@ static int ucsi_register_port(struct ucsi *ucsi, int index) trace_ucsi_register_port(con->num, &con->status); out: + fwnode_handle_put(cap->fwnode); +out_unlock: mutex_unlock(&con->lock); return ret; } From e17b02d4970913233d543c79c9c66e72cac05bdd Mon Sep 17 00:00:00 2001 From: Marcel Hamer Date: Tue, 27 Apr 2021 14:21:18 +0200 Subject: [PATCH 116/263] usb: dwc3: omap: improve extcon initialization When extcon is used in combination with dwc3, it is assumed that the dwc3 registers are untouched and as such are only configured if VBUS is valid or ID is tied to ground. In case VBUS is not valid or ID is floating, the registers are not configured as such during driver initialization, causing a wrong default state during boot. If the registers are not in a default state, because they are for instance touched by a boot loader, this can cause for a kernel error. Signed-off-by: Marcel Hamer Link: https://lore.kernel.org/r/20210427122118.1948340-1-marcel@solidxs.se Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-omap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index 3db17806e92e7..e196673f5c647 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -437,8 +437,13 @@ static int dwc3_omap_extcon_register(struct dwc3_omap *omap) if (extcon_get_state(edev, EXTCON_USB) == true) dwc3_omap_set_mailbox(omap, OMAP_DWC3_VBUS_VALID); + else + dwc3_omap_set_mailbox(omap, OMAP_DWC3_VBUS_OFF); + if (extcon_get_state(edev, EXTCON_USB_HOST) == true) dwc3_omap_set_mailbox(omap, OMAP_DWC3_ID_GROUND); + else + dwc3_omap_set_mailbox(omap, OMAP_DWC3_ID_FLOAT); omap->edev = edev; } From f75297853470627c4ee4e2b80eed40af7441c96b Mon Sep 17 00:00:00 2001 From: Wei Ming Chen Date: Thu, 6 May 2021 20:20:20 +0800 Subject: [PATCH 117/263] docs: usb: function: Modify path name Original path does not exists, so changed to "Documentation/ABI/testing/configfs-usb-gadget" Signed-off-by: Wei Ming Chen Link: https://lore.kernel.org/r/20210506122020.7117-1-jj251510319013@gmail.com Signed-off-by: Greg Kroah-Hartman --- Documentation/usb/gadget_configfs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/usb/gadget_configfs.rst b/Documentation/usb/gadget_configfs.rst index 158e48dab586d..e4566ffb223f2 100644 --- a/Documentation/usb/gadget_configfs.rst +++ b/Documentation/usb/gadget_configfs.rst @@ -140,7 +140,7 @@ is an arbitrary string allowed in a filesystem, e.g.:: Each function provides its specific set of attributes, with either read-only or read-write access. Where applicable they need to be written to as appropriate. -Please refer to Documentation/ABI/*/configfs-usb-gadget* for more information. +Please refer to Documentation/ABI/testing/configfs-usb-gadget for more information. 4. Associating the functions with their configurations ------------------------------------------------------ From a60a34366e0d09ca002c966dd7c43a68c28b1f82 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 May 2021 22:39:10 +0200 Subject: [PATCH 118/263] usb: fotg210-hcd: Fix an error message 'retval' is known to be -ENODEV here. This is a hard-coded default error code which is not useful in the error message. Moreover, another error message is printed at the end of the error handling path. The corresponding error code (-ENOMEM) is more informative. So remove simplify the first error message. While at it, also remove the useless initialization of 'retval'. Fixes: 7d50195f6c50 ("usb: host: Faraday fotg210-hcd driver") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/94531bcff98e46d4f9c20183a90b7f47f699126c.1620333419.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/fotg210-hcd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/fotg210-hcd.c b/drivers/usb/host/fotg210-hcd.c index 6cac642520fc8..9c2eda0918e13 100644 --- a/drivers/usb/host/fotg210-hcd.c +++ b/drivers/usb/host/fotg210-hcd.c @@ -5568,7 +5568,7 @@ static int fotg210_hcd_probe(struct platform_device *pdev) struct usb_hcd *hcd; struct resource *res; int irq; - int retval = -ENODEV; + int retval; struct fotg210_hcd *fotg210; if (usb_disabled()) @@ -5588,7 +5588,7 @@ static int fotg210_hcd_probe(struct platform_device *pdev) hcd = usb_create_hcd(&fotg210_fotg210_hc_driver, dev, dev_name(dev)); if (!hcd) { - dev_err(dev, "failed to create hcd with err %d\n", retval); + dev_err(dev, "failed to create hcd\n"); retval = -ENOMEM; goto fail_create_hcd; } From 726c945ab2ebd104631b6105ab455a5bc604a3f1 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 27 Apr 2021 12:42:19 +0800 Subject: [PATCH 119/263] hwmon: (corsair-psu) Remove unneeded semicolons Fix the following coccicheck warning: ./drivers/hwmon/corsair-psu.c:379:2-3: Unneeded semicolon Remove unneeded semicolons. Signed-off-by: Wan Jiabing Link: https://lore.kernel.org/r/20210427044219.7799-1-wanjiabing@vivo.com Signed-off-by: Guenter Roeck --- drivers/hwmon/corsair-psu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/corsair-psu.c b/drivers/hwmon/corsair-psu.c index 3a5807e4a2efb..02298b86b57b6 100644 --- a/drivers/hwmon/corsair-psu.c +++ b/drivers/hwmon/corsair-psu.c @@ -355,7 +355,7 @@ static umode_t corsairpsu_hwmon_power_is_visible(const struct corsairpsu_data *p return 0444; default: return 0; - }; + } } static umode_t corsairpsu_hwmon_in_is_visible(const struct corsairpsu_data *priv, u32 attr, @@ -376,7 +376,7 @@ static umode_t corsairpsu_hwmon_in_is_visible(const struct corsairpsu_data *priv break; default: break; - }; + } return res; } From 5216dff22dc2bbbbe6f00335f9fd2879670e753b Mon Sep 17 00:00:00 2001 From: Eddie James Date: Thu, 29 Apr 2021 10:13:36 -0500 Subject: [PATCH 120/263] hwmon: (occ) Fix poll rate limiting The poll rate limiter time was initialized at zero. This breaks the comparison in time_after if jiffies is large. Switch to storing the next update time rather than the previous time, and initialize the time when the device is probed. Fixes: c10e753d43eb ("hwmon (occ): Add sensor types and versions") Signed-off-by: Eddie James Link: https://lore.kernel.org/r/20210429151336.18980-1-eajames@linux.ibm.com Signed-off-by: Guenter Roeck --- drivers/hwmon/occ/common.c | 5 +++-- drivers/hwmon/occ/common.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c index f1ac153d0b568..967532afb1c01 100644 --- a/drivers/hwmon/occ/common.c +++ b/drivers/hwmon/occ/common.c @@ -217,9 +217,9 @@ int occ_update_response(struct occ *occ) return rc; /* limit the maximum rate of polling the OCC */ - if (time_after(jiffies, occ->last_update + OCC_UPDATE_FREQUENCY)) { + if (time_after(jiffies, occ->next_update)) { rc = occ_poll(occ); - occ->last_update = jiffies; + occ->next_update = jiffies + OCC_UPDATE_FREQUENCY; } else { rc = occ->last_error; } @@ -1165,6 +1165,7 @@ int occ_setup(struct occ *occ, const char *name) return rc; } + occ->next_update = jiffies + OCC_UPDATE_FREQUENCY; occ_parse_poll_response(occ); rc = occ_setup_sensor_attrs(occ); diff --git a/drivers/hwmon/occ/common.h b/drivers/hwmon/occ/common.h index 67e6968b8978e..e6df719770e81 100644 --- a/drivers/hwmon/occ/common.h +++ b/drivers/hwmon/occ/common.h @@ -99,7 +99,7 @@ struct occ { u8 poll_cmd_data; /* to perform OCC poll command */ int (*send_cmd)(struct occ *occ, u8 *cmd); - unsigned long last_update; + unsigned long next_update; struct mutex lock; /* lock OCC access */ struct device *hwmon; From 2d101db3e5be3bbee6001d4227705cec70ecb82e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Kubern=C3=A1t?= Date: Thu, 29 Apr 2021 09:53:38 +0200 Subject: [PATCH 121/263] hwmon: (pmbus/fsp-3y) Fix FSP-3Y YH-5151E non-compliant vout encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I didn't properly test the driver for YH-5151E, so it was completely broken. Firstly, the log/real mapping was incorrect in one case. Secondly, PMBus specifies that output voltages should be in the linear16 encoding. However, the YH-5151E is non-compliant and uses linear11. YM-2151E isn't affected by this. Fix this by converting the values inside the read functions. linear16 gets the exponent from the VOUT_MODE command. The device doesn't support it, so I have to manually supply the value for it. Both supported devices have now been tested to report correct vout values. Fixes: 1734b4135a62 ("hwmon: Add driver for fsp-3y PSUs and PDUs") Signed-off-by: Václav Kubernát Link: https://lore.kernel.org/r/20210429075337.110502-1-kubernat@cesnet.cz Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/fsp-3y.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/pmbus/fsp-3y.c b/drivers/hwmon/pmbus/fsp-3y.c index b177987286ae0..e248424752545 100644 --- a/drivers/hwmon/pmbus/fsp-3y.c +++ b/drivers/hwmon/pmbus/fsp-3y.c @@ -57,7 +57,7 @@ static int page_log_to_page_real(int page_log, enum chips chip) case YH5151E_PAGE_12V_LOG: return YH5151E_PAGE_12V_REAL; case YH5151E_PAGE_5V_LOG: - return YH5151E_PAGE_5V_LOG; + return YH5151E_PAGE_5V_REAL; case YH5151E_PAGE_3V3_LOG: return YH5151E_PAGE_3V3_REAL; } @@ -103,8 +103,18 @@ static int set_page(struct i2c_client *client, int page_log) static int fsp3y_read_byte_data(struct i2c_client *client, int page, int reg) { + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct fsp3y_data *data = to_fsp3y_data(info); int rv; + /* + * YH5151-E outputs vout in linear11. The conversion is done when + * reading. Here, we have to inject pmbus_core with the correct + * exponent (it is -6). + */ + if (data->chip == yh5151e && reg == PMBUS_VOUT_MODE) + return 0x1A; + rv = set_page(client, page); if (rv < 0) return rv; @@ -114,6 +124,8 @@ static int fsp3y_read_byte_data(struct i2c_client *client, int page, int reg) static int fsp3y_read_word_data(struct i2c_client *client, int page, int phase, int reg) { + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct fsp3y_data *data = to_fsp3y_data(info); int rv; /* @@ -144,7 +156,18 @@ static int fsp3y_read_word_data(struct i2c_client *client, int page, int phase, if (rv < 0) return rv; - return i2c_smbus_read_word_data(client, reg); + rv = i2c_smbus_read_word_data(client, reg); + if (rv < 0) + return rv; + + /* + * YH-5151E is non-compliant and outputs output voltages in linear11 + * instead of linear16. + */ + if (data->chip == yh5151e && reg == PMBUS_READ_VOUT) + rv = sign_extend32(rv, 10) & 0xffff; + + return rv; } static struct pmbus_driver_info fsp3y_info[] = { From 1f4642b72be79757f050924a9b9673b6a02034bc Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Mon, 3 May 2021 00:46:11 -0700 Subject: [PATCH 122/263] usb: typec: ucsi: Retrieve all the PDOs instead of just the first 4 commit 4dbc6a4ef06d ("usb: typec: ucsi: save power data objects in PD mode") introduced retrieval of the PDOs when connected to a PD-capable source. But only the first 4 PDOs are received since that is the maximum number that can be fetched at a time given the MESSAGE_IN length limitation (16 bytes). However, as per the PD spec a connected source may advertise up to a maximum of 7 PDOs. If such a source is connected it's possible the PPM could have negotiated a power contract with one of the PDOs at index greater than 4, and would be reflected in the request data object's (RDO) object position field. This would result in an out-of-bounds access when the rdo_index() is used to index into the src_pdos array in ucsi_psy_get_voltage_now(). With the help of the UBSAN -fsanitize=array-bounds checker enabled this exact issue is revealed when connecting to a PD source adapter that advertise 5 PDOs and the PPM enters a contract having selected the 5th one. [ 151.545106][ T70] Unexpected kernel BRK exception at EL1 [ 151.545112][ T70] Internal error: BRK handler: f2005512 [#1] PREEMPT SMP ... [ 151.545499][ T70] pc : ucsi_psy_get_prop+0x208/0x20c [ 151.545507][ T70] lr : power_supply_show_property+0xc0/0x328 ... [ 151.545542][ T70] Call trace: [ 151.545544][ T70] ucsi_psy_get_prop+0x208/0x20c [ 151.545546][ T70] power_supply_uevent+0x1a4/0x2f0 [ 151.545550][ T70] dev_uevent+0x200/0x384 [ 151.545555][ T70] kobject_uevent_env+0x1d4/0x7e8 [ 151.545557][ T70] power_supply_changed_work+0x174/0x31c [ 151.545562][ T70] process_one_work+0x244/0x6f0 [ 151.545564][ T70] worker_thread+0x3e0/0xa64 We can resolve this by instead retrieving and storing up to the maximum of 7 PDOs in the con->src_pdos array. This would involve two calls to the GET_PDOS command. Fixes: 992a60ed0d5e ("usb: typec: ucsi: register with power_supply class") Fixes: 4dbc6a4ef06d ("usb: typec: ucsi: save power data objects in PD mode") Cc: stable@vger.kernel.org Reported-and-tested-by: Subbaraman Narayanamurthy Reviewed-by: Heikki Krogerus Signed-off-by: Jack Pham Link: https://lore.kernel.org/r/20210503074611.30973-1-jackp@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 41 +++++++++++++++++++++++++++-------- drivers/usb/typec/ucsi/ucsi.h | 6 +++-- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 0e1cec346e0f8..1d8b7df59ff49 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -495,7 +495,8 @@ static void ucsi_unregister_altmodes(struct ucsi_connector *con, u8 recipient) } } -static void ucsi_get_pdos(struct ucsi_connector *con, int is_partner) +static int ucsi_get_pdos(struct ucsi_connector *con, int is_partner, + u32 *pdos, int offset, int num_pdos) { struct ucsi *ucsi = con->ucsi; u64 command; @@ -503,17 +504,39 @@ static void ucsi_get_pdos(struct ucsi_connector *con, int is_partner) command = UCSI_COMMAND(UCSI_GET_PDOS) | UCSI_CONNECTOR_NUMBER(con->num); command |= UCSI_GET_PDOS_PARTNER_PDO(is_partner); - command |= UCSI_GET_PDOS_NUM_PDOS(UCSI_MAX_PDOS - 1); + command |= UCSI_GET_PDOS_PDO_OFFSET(offset); + command |= UCSI_GET_PDOS_NUM_PDOS(num_pdos - 1); command |= UCSI_GET_PDOS_SRC_PDOS; - ret = ucsi_send_command(ucsi, command, con->src_pdos, - sizeof(con->src_pdos)); - if (ret < 0) { + ret = ucsi_send_command(ucsi, command, pdos + offset, + num_pdos * sizeof(u32)); + if (ret < 0) dev_err(ucsi->dev, "UCSI_GET_PDOS failed (%d)\n", ret); + if (ret == 0 && offset == 0) + dev_warn(ucsi->dev, "UCSI_GET_PDOS returned 0 bytes\n"); + + return ret; +} + +static void ucsi_get_src_pdos(struct ucsi_connector *con, int is_partner) +{ + int ret; + + /* UCSI max payload means only getting at most 4 PDOs at a time */ + ret = ucsi_get_pdos(con, 1, con->src_pdos, 0, UCSI_MAX_PDOS); + if (ret < 0) return; - } + con->num_pdos = ret / sizeof(u32); /* number of bytes to 32-bit PDOs */ - if (ret == 0) - dev_warn(ucsi->dev, "UCSI_GET_PDOS returned 0 bytes\n"); + if (con->num_pdos < UCSI_MAX_PDOS) + return; + + /* get the remaining PDOs, if any */ + ret = ucsi_get_pdos(con, 1, con->src_pdos, UCSI_MAX_PDOS, + PDO_MAX_OBJECTS - UCSI_MAX_PDOS); + if (ret < 0) + return; + + con->num_pdos += ret / sizeof(u32); } static void ucsi_pwr_opmode_change(struct ucsi_connector *con) @@ -522,7 +545,7 @@ static void ucsi_pwr_opmode_change(struct ucsi_connector *con) case UCSI_CONSTAT_PWR_OPMODE_PD: con->rdo = con->status.request_data_obj; typec_set_pwr_opmode(con->port, TYPEC_PWR_MODE_PD); - ucsi_get_pdos(con, 1); + ucsi_get_src_pdos(con, 1); break; case UCSI_CONSTAT_PWR_OPMODE_TYPEC1_5: con->rdo = 0; diff --git a/drivers/usb/typec/ucsi/ucsi.h b/drivers/usb/typec/ucsi/ucsi.h index 3920e20a9e9ef..cee666790907e 100644 --- a/drivers/usb/typec/ucsi/ucsi.h +++ b/drivers/usb/typec/ucsi/ucsi.h @@ -8,6 +8,7 @@ #include #include #include +#include #include /* -------------------------------------------------------------------------- */ @@ -134,7 +135,9 @@ void ucsi_connector_change(struct ucsi *ucsi, u8 num); /* GET_PDOS command bits */ #define UCSI_GET_PDOS_PARTNER_PDO(_r_) ((u64)(_r_) << 23) +#define UCSI_GET_PDOS_PDO_OFFSET(_r_) ((u64)(_r_) << 24) #define UCSI_GET_PDOS_NUM_PDOS(_r_) ((u64)(_r_) << 32) +#define UCSI_MAX_PDOS (4) #define UCSI_GET_PDOS_SRC_PDOS ((u64)1 << 34) /* -------------------------------------------------------------------------- */ @@ -302,7 +305,6 @@ struct ucsi { #define UCSI_MAX_SVID 5 #define UCSI_MAX_ALTMODES (UCSI_MAX_SVID * 6) -#define UCSI_MAX_PDOS (4) #define UCSI_TYPEC_VSAFE5V 5000 #define UCSI_TYPEC_1_5_CURRENT 1500 @@ -330,7 +332,7 @@ struct ucsi_connector { struct power_supply *psy; struct power_supply_desc psy_desc; u32 rdo; - u32 src_pdos[UCSI_MAX_PDOS]; + u32 src_pdos[PDO_MAX_OBJECTS]; int num_pdos; struct usb_role_switch *usb_role_sw; From c34e85fa69b9f4568f19da3af06c3870dd8fcc50 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Fri, 7 May 2021 14:22:59 +0800 Subject: [PATCH 123/263] usb: typec: tcpm: Send DISCOVER_IDENTITY from dedicated work In current design, DISCOVER_IDENTITY is queued to VDM state machine immediately in Ready states and never retries if it fails in the AMS. Move the process to a delayed work so that when it fails for some reasons (e.g. Sink Tx No Go), it can be retried by queueing the work again. Also fix a problem that the vdm_state is not set to a proper state if it is blocked by Collision Avoidance mechanism. Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Kyle Tso Link: https://lore.kernel.org/r/20210507062300.1945009-2-kyletso@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 85 ++++++++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 10 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 68e04e397e924..ae1e84252d38f 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -259,6 +259,7 @@ enum frs_typec_current { #define ALTMODE_DISCOVERY_MAX (SVID_DISCOVERY_MAX * MODE_DISCOVERY_MAX) #define GET_SINK_CAP_RETRY_MS 100 +#define SEND_DISCOVER_RETRY_MS 100 struct pd_mode_data { int svid_index; /* current SVID index */ @@ -366,6 +367,8 @@ struct tcpm_port { struct kthread_work vdm_state_machine; struct hrtimer enable_frs_timer; struct kthread_work enable_frs; + struct hrtimer send_discover_timer; + struct kthread_work send_discover_work; bool state_machine_running; bool vdm_sm_running; @@ -1178,6 +1181,16 @@ static void mod_enable_frs_delayed_work(struct tcpm_port *port, unsigned int del } } +static void mod_send_discover_delayed_work(struct tcpm_port *port, unsigned int delay_ms) +{ + if (delay_ms) { + hrtimer_start(&port->send_discover_timer, ms_to_ktime(delay_ms), HRTIMER_MODE_REL); + } else { + hrtimer_cancel(&port->send_discover_timer); + kthread_queue_work(port->wq, &port->send_discover_work); + } +} + static void tcpm_set_state(struct tcpm_port *port, enum tcpm_state state, unsigned int delay_ms) { @@ -1855,6 +1868,9 @@ static void vdm_run_state_machine(struct tcpm_port *port) res = tcpm_ams_start(port, DISCOVER_IDENTITY); if (res == 0) port->send_discover = false; + else if (res == -EAGAIN) + mod_send_discover_delayed_work(port, + SEND_DISCOVER_RETRY_MS); break; case CMD_DISCOVER_SVID: res = tcpm_ams_start(port, DISCOVER_SVIDS); @@ -1880,6 +1896,7 @@ static void vdm_run_state_machine(struct tcpm_port *port) } if (res < 0) { + port->vdm_state = VDM_STATE_ERR_BUSY; port->vdm_sm_running = false; return; } @@ -3682,14 +3699,6 @@ static inline enum tcpm_state unattached_state(struct tcpm_port *port) return SNK_UNATTACHED; } -static void tcpm_check_send_discover(struct tcpm_port *port) -{ - if ((port->data_role == TYPEC_HOST || port->negotiated_rev > PD_REV20) && - port->send_discover && port->pd_capable) - tcpm_send_vdm(port, USB_SID_PD, CMD_DISCOVER_IDENT, NULL, 0); - port->send_discover = false; -} - static void tcpm_swap_complete(struct tcpm_port *port, int result) { if (port->swap_pending) { @@ -3926,7 +3935,18 @@ static void run_state_machine(struct tcpm_port *port) break; } - tcpm_check_send_discover(port); + /* + * 6.4.4.3.1 Discover Identity + * "The Discover Identity Command Shall only be sent to SOP when there is an + * Explicit Contract." + * For now, this driver only supports SOP for DISCOVER_IDENTITY, thus using + * port->explicit_contract to decide whether to send the command. + */ + if (port->explicit_contract) + mod_send_discover_delayed_work(port, 0); + else + port->send_discover = false; + /* * 6.3.5 * Sending ping messages is not necessary if @@ -4194,7 +4214,18 @@ static void run_state_machine(struct tcpm_port *port) break; } - tcpm_check_send_discover(port); + /* + * 6.4.4.3.1 Discover Identity + * "The Discover Identity Command Shall only be sent to SOP when there is an + * Explicit Contract." + * For now, this driver only supports SOP for DISCOVER_IDENTITY, thus using + * port->explicit_contract. + */ + if (port->explicit_contract) + mod_send_discover_delayed_work(port, 0); + else + port->send_discover = false; + power_supply_changed(port->psy); break; @@ -5288,6 +5319,29 @@ static void tcpm_enable_frs_work(struct kthread_work *work) mutex_unlock(&port->lock); } +static void tcpm_send_discover_work(struct kthread_work *work) +{ + struct tcpm_port *port = container_of(work, struct tcpm_port, send_discover_work); + + mutex_lock(&port->lock); + /* No need to send DISCOVER_IDENTITY anymore */ + if (!port->send_discover) + goto unlock; + + /* Retry if the port is not idle */ + if ((port->state != SRC_READY && port->state != SNK_READY) || port->vdm_sm_running) { + mod_send_discover_delayed_work(port, SEND_DISCOVER_RETRY_MS); + goto unlock; + } + + /* Only send the Message if the port is host for PD rev2.0 */ + if (port->data_role == TYPEC_HOST || port->negotiated_rev > PD_REV20) + tcpm_send_vdm(port, USB_SID_PD, CMD_DISCOVER_IDENT, NULL, 0); + +unlock: + mutex_unlock(&port->lock); +} + static int tcpm_dr_set(struct typec_port *p, enum typec_data_role data) { struct tcpm_port *port = typec_get_drvdata(p); @@ -6093,6 +6147,14 @@ static enum hrtimer_restart enable_frs_timer_handler(struct hrtimer *timer) return HRTIMER_NORESTART; } +static enum hrtimer_restart send_discover_timer_handler(struct hrtimer *timer) +{ + struct tcpm_port *port = container_of(timer, struct tcpm_port, send_discover_timer); + + kthread_queue_work(port->wq, &port->send_discover_work); + return HRTIMER_NORESTART; +} + struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) { struct tcpm_port *port; @@ -6123,12 +6185,15 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) kthread_init_work(&port->vdm_state_machine, vdm_state_machine_work); kthread_init_work(&port->event_work, tcpm_pd_event_handler); kthread_init_work(&port->enable_frs, tcpm_enable_frs_work); + kthread_init_work(&port->send_discover_work, tcpm_send_discover_work); hrtimer_init(&port->state_machine_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); port->state_machine_timer.function = state_machine_timer_handler; hrtimer_init(&port->vdm_state_machine_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); port->vdm_state_machine_timer.function = vdm_state_machine_timer_handler; hrtimer_init(&port->enable_frs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); port->enable_frs_timer.function = enable_frs_timer_handler; + hrtimer_init(&port->send_discover_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + port->send_discover_timer.function = send_discover_timer_handler; spin_lock_init(&port->pd_event_lock); From f1fbd950b59b67bc5c202216c8e1c6ca8c99a3b4 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Fri, 7 May 2021 14:23:00 +0800 Subject: [PATCH 124/263] usb: typec: tcpm: Fix wrong handling for Not_Supported in VDM AMS Not_Supported Message is acceptable in VDM AMS. Redirect the VDM state machine to VDM_STATE_DONE when receiving Not_Supported and finish the VDM AMS. Also, after the loop in vdm_state_machine_work, add more conditions of VDM states to clear the vdm_sm_running flag because those are all stopping states when leaving the loop. In addition, finish the VDM AMS if the port partner responds BUSY. Fixes: 8dea75e11380 ("usb: typec: tcpm: Protocol Error handling") Fixes: 8d3a0578ad1a ("usb: typec: tcpm: Respond Wait if VDM state machine is running") Reviewed-by: Guenter Roeck Signed-off-by: Kyle Tso Link: https://lore.kernel.org/r/20210507062300.1945009-3-kyletso@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index ae1e84252d38f..db567e6fde924 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -1897,7 +1897,6 @@ static void vdm_run_state_machine(struct tcpm_port *port) if (res < 0) { port->vdm_state = VDM_STATE_ERR_BUSY; - port->vdm_sm_running = false; return; } } @@ -1913,6 +1912,7 @@ static void vdm_run_state_machine(struct tcpm_port *port) port->vdo_data[0] = port->vdo_retry; port->vdo_count = 1; port->vdm_state = VDM_STATE_READY; + tcpm_ams_finish(port); break; case VDM_STATE_BUSY: port->vdm_state = VDM_STATE_ERR_TMOUT; @@ -1978,7 +1978,7 @@ static void vdm_state_machine_work(struct kthread_work *work) port->vdm_state != VDM_STATE_BUSY && port->vdm_state != VDM_STATE_SEND_MESSAGE); - if (port->vdm_state == VDM_STATE_ERR_TMOUT) + if (port->vdm_state < VDM_STATE_READY) port->vdm_sm_running = false; mutex_unlock(&port->lock); @@ -2569,6 +2569,16 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, port->sink_cap_done = true; tcpm_set_state(port, ready_state(port), 0); break; + case SRC_READY: + case SNK_READY: + if (port->vdm_state > VDM_STATE_READY) { + port->vdm_state = VDM_STATE_DONE; + if (tcpm_vdm_ams(port)) + tcpm_ams_finish(port); + mod_vdm_delayed_work(port, 0); + break; + } + fallthrough; default: tcpm_pd_handle_state(port, port->pwr_role == TYPEC_SOURCE ? From d9ff1096a840dddea3d5cfa2149ff7da9f499fb2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 4 May 2021 22:26:29 +0200 Subject: [PATCH 125/263] usb: musb: Fix an error message 'ret' is known to be 0 here. Initialize 'ret' with the expected error code before using it. Fixes: 0990366bab3c ("usb: musb: Add support for MediaTek musb controller") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/69f514dc7134e3c917cad208e73cc650cb9e2bd6.1620159879.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/mediatek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/musb/mediatek.c b/drivers/usb/musb/mediatek.c index eebeadd269461..6b92d037d8fc8 100644 --- a/drivers/usb/musb/mediatek.c +++ b/drivers/usb/musb/mediatek.c @@ -518,8 +518,8 @@ static int mtk_musb_probe(struct platform_device *pdev) glue->xceiv = devm_usb_get_phy(dev, USB_PHY_TYPE_USB2); if (IS_ERR(glue->xceiv)) { - dev_err(dev, "fail to getting usb-phy %d\n", ret); ret = PTR_ERR(glue->xceiv); + dev_err(dev, "fail to getting usb-phy %d\n", ret); goto err_unregister_usb_phy; } From 28ec344bb8911bb0d4910456b22ba0dd4f662521 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Wed, 5 May 2021 17:44:22 -0700 Subject: [PATCH 126/263] usb: typec: tcpm: Don't block probing of consumers of "connector" nodes fw_devlink expects DT device nodes with "compatible" property to have struct devices created for them. Since the connector node might not be populated as a device, mark it as such so that fw_devlink knows not to wait on this fwnode being populated as a struct device. Without this patch, USB functionality can be broken on some boards. Fixes: f7514a663016 ("of: property: fw_devlink: Add support for remote-endpoint") Reported-by: John Stultz Tested-by: John Stultz Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20210506004423.345199-1-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 3 ++- drivers/usb/typec/tcpm/tcpm.c | 9 +++++++++ include/linux/fwnode.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 4a8bf8cda52bc..628e33939acae 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -150,7 +150,7 @@ void fwnode_links_purge(struct fwnode_handle *fwnode) fwnode_links_purge_consumers(fwnode); } -static void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode) +void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode) { struct fwnode_handle *child; @@ -164,6 +164,7 @@ static void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode) fwnode_for_each_available_child_node(fwnode, child) fw_devlink_purge_absent_suppliers(child); } +EXPORT_SYMBOL_GPL(fw_devlink_purge_absent_suppliers); #ifdef CONFIG_SRCU static DEFINE_MUTEX(device_links_lock); diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index c4fdc00a3bc8f..bffa342d4e386 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5754,6 +5754,15 @@ static int tcpm_fw_get_caps(struct tcpm_port *port, if (!fwnode) return -EINVAL; + /* + * This fwnode has a "compatible" property, but is never populated as a + * struct device. Instead we simply parse it to read the properties. + * This it breaks fw_devlink=on. To maintain backward compatibility + * with existing DT files, we work around this by deleting any + * fwnode_links to/from this fwnode. + */ + fw_devlink_purge_absent_suppliers(fwnode); + /* USB data support is optional */ ret = fwnode_property_read_string(fwnode, "data-role", &cap_str); if (ret == 0) { diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index ed4e67a7ff1c4..59828516ebaf1 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -187,5 +187,6 @@ extern u32 fw_devlink_get_flags(void); extern bool fw_devlink_is_strict(void); int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); void fwnode_links_purge(struct fwnode_handle *fwnode); +void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); #endif From 8370e5b093080c03cf89f7ebf0bef6984545429e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 May 2021 13:01:36 +0300 Subject: [PATCH 127/263] hwmon: (ltc2992) Put fwnode in error case during ->probe() In each iteration fwnode_for_each_available_child_node() bumps a reference counting of a loop variable followed by dropping in on a next iteration, Since in error case the loop is broken, we have to drop a reference count by ourselves. Do it for port_fwnode in error case during ->probe(). Fixes: b0bd407e94b0 ("hwmon: (ltc2992) Add support") Cc: Alexandru Tachici Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210510100136.3303142-1-andy.shevchenko@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ltc2992.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 4382105bf1420..2a4bed0ab226b 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -900,11 +900,15 @@ static int ltc2992_parse_dt(struct ltc2992_state *st) fwnode_for_each_available_child_node(fwnode, child) { ret = fwnode_property_read_u32(child, "reg", &addr); - if (ret < 0) + if (ret < 0) { + fwnode_handle_put(child); return ret; + } - if (addr > 1) + if (addr > 1) { + fwnode_handle_put(child); return -EINVAL; + } ret = fwnode_property_read_u32(child, "shunt-resistor-micro-ohms", &val); if (!ret) From 63c8af5687f6b1b70e9458cac1ffb25e86db1695 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 10 May 2021 08:48:06 +0900 Subject: [PATCH 128/263] block: uapi: fix comment about block device ioctl Fix the comment mentioning ioctl command range used for zoned block devices to reflect the range of commands actually implemented. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210509234806.3000-1-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/uapi/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f44eb0a04afdd..4c32e97dcdf00 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -185,7 +185,7 @@ struct fsxattr { #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) /* - * A jump here: 130-131 are reserved for zoned block devices + * A jump here: 130-136 are reserved for zoned block devices * (see uapi/linux/blkzoned.h) */ From 0c8bd174f0fc131bc9dfab35cd8784f59045da87 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 8 May 2021 09:23:09 +0200 Subject: [PATCH 129/263] ACPI: scan: Fix a memory leak in an error handling path If 'acpi_device_set_name()' fails, we must free 'acpi_device_bus_id->bus_id' or there is a (potential) memory leak. Fixes: eb50aaf960e3 ("ACPI: scan: Use unique number for instance_no") Signed-off-by: Christophe JAILLET Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- drivers/acpi/scan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index a22778e880c22..651a431e2bbf1 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -700,6 +700,7 @@ int acpi_device_add(struct acpi_device *device, result = acpi_device_set_name(device, acpi_device_bus_id); if (result) { + kfree_const(acpi_device_bus_id->bus_id); kfree(acpi_device_bus_id); goto err_unlock; } From c745253e2a691a40c66790defe85c104a887e14a Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 5 May 2021 14:09:15 +0300 Subject: [PATCH 130/263] PM: runtime: Fix unpaired parent child_count for force_resume As pm_runtime_need_not_resume() relies also on usage_count, it can return a different value in pm_runtime_force_suspend() compared to when called in pm_runtime_force_resume(). Different return values can happen if anything calls PM runtime functions in between, and causes the parent child_count to increase on every resume. So far I've seen the issue only for omapdrm that does complicated things with PM runtime calls during system suspend for legacy reasons: omap_atomic_commit_tail() for omapdrm.0 dispc_runtime_get() wakes up 58000000.dss as it's the dispc parent dispc_runtime_resume() rpm_resume() increases parent child_count dispc_runtime_put() won't idle, PM runtime suspend blocked pm_runtime_force_suspend() for 58000000.dss, !pm_runtime_need_not_resume() __update_runtime_status() system suspended pm_runtime_force_resume() for 58000000.dss, pm_runtime_need_not_resume() pm_runtime_enable() only called because of pm_runtime_need_not_resume() omap_atomic_commit_tail() for omapdrm.0 dispc_runtime_get() wakes up 58000000.dss as it's the dispc parent dispc_runtime_resume() rpm_resume() increases parent child_count dispc_runtime_put() won't idle, PM runtime suspend blocked ... rpm_suspend for 58000000.dss but parent child_count is now unbalanced Let's fix the issue by adding a flag for needs_force_resume and use it in pm_runtime_force_resume() instead of pm_runtime_need_not_resume(). Additionally omapdrm system suspend could be simplified later on to avoid lots of unnecessary PM runtime calls and the complexity it adds. The driver can just use internal functions that are shared between the PM runtime and system suspend related functions. Fixes: 4918e1f87c5f ("PM / runtime: Rework pm_runtime_force_suspend/resume()") Signed-off-by: Tony Lindgren Reviewed-by: Ulf Hansson Tested-by: Tomi Valkeinen Cc: 4.16+ # 4.16+ Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime.c | 10 +++++++--- include/linux/pm.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 1fc1a992f90ca..b570848d23e0e 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1637,6 +1637,7 @@ void pm_runtime_init(struct device *dev) dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; + dev->power.needs_force_resume = 0; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; @@ -1804,10 +1805,12 @@ int pm_runtime_force_suspend(struct device *dev) * its parent, but set its status to RPM_SUSPENDED anyway in case this * function will be called again for it in the meantime. */ - if (pm_runtime_need_not_resume(dev)) + if (pm_runtime_need_not_resume(dev)) { pm_runtime_set_suspended(dev); - else + } else { __update_runtime_status(dev, RPM_SUSPENDED); + dev->power.needs_force_resume = 1; + } return 0; @@ -1834,7 +1837,7 @@ int pm_runtime_force_resume(struct device *dev) int (*callback)(struct device *); int ret = 0; - if (!pm_runtime_status_suspended(dev) || pm_runtime_need_not_resume(dev)) + if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume) goto out; /* @@ -1853,6 +1856,7 @@ int pm_runtime_force_resume(struct device *dev) pm_runtime_mark_last_busy(dev); out: + dev->power.needs_force_resume = 0; pm_runtime_enable(dev); return ret; } diff --git a/include/linux/pm.h b/include/linux/pm.h index c9657408fee1a..1d8209c09686c 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -601,6 +601,7 @@ struct dev_pm_info { unsigned int idle_notification:1; unsigned int request_pending:1; unsigned int deferred_resume:1; + unsigned int needs_force_resume:1; unsigned int runtime_auto:1; bool ignore_children:1; unsigned int no_callbacks:1; From 37a8024d265564eba680575df6421f19db21dfce Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 7 May 2021 11:59:05 -0700 Subject: [PATCH 131/263] arm64: mte: initialize RGSR_EL1.SEED in __cpu_setup A valid implementation choice for the ChooseRandomNonExcludedTag() pseudocode function used by IRG is to behave in the same way as with GCR_EL1.RRND=0. This would mean that RGSR_EL1.SEED is used as an LFSR which must have a non-zero value in order for IRG to properly produce pseudorandom numbers. However, RGSR_EL1 is reset to an UNKNOWN value on soft reset and thus may reset to 0. Therefore we must initialize RGSR_EL1.SEED to a non-zero value in order to ensure that IRG behaves as expected. Signed-off-by: Peter Collingbourne Fixes: 3b714d24ef17 ("arm64: mte: CPU feature detection and initial sysreg configuration") Cc: # 5.10 Link: https://linux-review.googlesource.com/id/I2b089b6c7d6f17ee37e2f0db7df5ad5bcc04526c Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210507185905.1745402-1-pcc@google.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/proc.S | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 0a48191534ff6..97d7bcd8d4f26 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -447,6 +447,18 @@ SYM_FUNC_START(__cpu_setup) mov x10, #(SYS_GCR_EL1_RRND | SYS_GCR_EL1_EXCL_MASK) msr_s SYS_GCR_EL1, x10 + /* + * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then + * RGSR_EL1.SEED must be non-zero for IRG to produce + * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we + * must initialize it. + */ + mrs x10, CNTVCT_EL0 + ands x10, x10, #SYS_RGSR_EL1_SEED_MASK + csinc x10, x10, xzr, ne + lsl x10, x10, #SYS_RGSR_EL1_SEED_SHIFT + msr_s SYS_RGSR_EL1, x10 + /* clear any pending tag check faults in TFSR*_EL1 */ msr_s SYS_TFSR_EL1, xzr msr_s SYS_TFSRE0_EL1, xzr From f79f7a2d96769d2a3e663a3e673066be77c30cc3 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Mon, 22 Mar 2021 17:58:19 +0530 Subject: [PATCH 132/263] arc: Fix typos/spellos s/commiting/committing/ s/defintion/definition/ s/gaurantees/guarantees/ s/interrpted/interrupted/ s/interrutps/interrupts/ s/succeded/succeeded/ s/unconditonally/unconditionally/ Reviewed-by: Christian Brauner Acked-by: Randy Dunlap Signed-off-by: Bhaskar Chowdhury Signed-off-by: Vineet Gupta --- arch/arc/Makefile | 2 +- arch/arc/include/asm/cmpxchg.h | 4 ++-- arch/arc/kernel/process.c | 8 ++++---- arch/arc/kernel/signal.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 4392c9c189c4d..e47adc97a89bf 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -31,7 +31,7 @@ endif ifdef CONFIG_ARC_CURR_IN_REG -# For a global register defintion, make sure it gets passed to every file +# For a global register definition, make sure it gets passed to every file # We had a customer reported bug where some code built in kernel was NOT using # any kernel headers, and missing the r25 global register # Can't do unconditionally because of recursive include issues diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h index 9b87e162e539b..dfeffa25499bf 100644 --- a/arch/arc/include/asm/cmpxchg.h +++ b/arch/arc/include/asm/cmpxchg.h @@ -116,7 +116,7 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr, * * Technically the lock is also needed for UP (boils down to irq save/restore) * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to - * be disabled thus can't possibly be interrpted/preempted/clobbered by xchg() + * be disabled thus can't possibly be interrupted/preempted/clobbered by xchg() * Other way around, xchg is one instruction anyways, so can't be interrupted * as such */ @@ -143,7 +143,7 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr, /* * "atomic" variant of xchg() * REQ: It needs to follow the same serialization rules as other atomic_xxx() - * Since xchg() doesn't always do that, it would seem that following defintion + * Since xchg() doesn't always do that, it would seem that following definition * is incorrect. But here's the rationale: * SMP : Even xchg() takes the atomic_ops_lock, so OK. * LLSC: atomic_ops_lock are not relevant at all (even if SMP, since LLSC diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index d838d0d576964..3793876f42d9b 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -50,14 +50,14 @@ SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new) int ret; /* - * This is only for old cores lacking LLOCK/SCOND, which by defintion + * This is only for old cores lacking LLOCK/SCOND, which by definition * can't possibly be SMP. Thus doesn't need to be SMP safe. * And this also helps reduce the overhead for serializing in * the UP case */ WARN_ON_ONCE(IS_ENABLED(CONFIG_SMP)); - /* Z indicates to userspace if operation succeded */ + /* Z indicates to userspace if operation succeeded */ regs->status32 &= ~STATUS_Z_MASK; ret = access_ok(uaddr, sizeof(*uaddr)); @@ -107,7 +107,7 @@ SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new) void arch_cpu_idle(void) { - /* Re-enable interrupts <= default irq priority before commiting SLEEP */ + /* Re-enable interrupts <= default irq priority before committing SLEEP */ const unsigned int arg = 0x10 | ARCV2_IRQ_DEF_PRIO; __asm__ __volatile__( @@ -120,7 +120,7 @@ void arch_cpu_idle(void) void arch_cpu_idle(void) { - /* sleep, but enable both set E1/E2 (levels of interrutps) before committing */ + /* sleep, but enable both set E1/E2 (levels of interrupts) before committing */ __asm__ __volatile__("sleep 0x3 \n"); } diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index fdbe06c98895e..b3ccb9e5ffe42 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -259,7 +259,7 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) regs->r2 = (unsigned long)&sf->uc; /* - * small optim to avoid unconditonally calling do_sigaltstack + * small optim to avoid unconditionally calling do_sigaltstack * in sigreturn path, now that we only have rt_sigreturn */ magic = MAGIC_SIGALTSTK; @@ -391,7 +391,7 @@ void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs) { /* - * ASM glue gaurantees that this is only called when returning to + * ASM glue guarantees that this is only called when returning to * user mode */ if (test_thread_flag(TIF_NOTIFY_RESUME)) From 8e97bf39fa0361af3e64739b3766992b9dafa11d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 21 Apr 2021 22:16:53 -0700 Subject: [PATCH 133/263] ARC: kgdb: add 'fallthrough' to prevent a warning Use the 'fallthrough' macro to document that this switch case does indeed fall through to the next case. ../arch/arc/kernel/kgdb.c: In function 'kgdb_arch_handle_exception': ../arch/arc/kernel/kgdb.c:141:6: warning: this statement may fall through [-Wimplicit-fallthrough=] 141 | if (kgdb_hex2long(&ptr, &addr)) | ^ ../arch/arc/kernel/kgdb.c:144:2: note: here 144 | case 'D': | ^~~~ Cc: linux-snps-arc@lists.infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Vineet Gupta --- arch/arc/kernel/kgdb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c index ecfbc42d3a40f..345a0000554cb 100644 --- a/arch/arc/kernel/kgdb.c +++ b/arch/arc/kernel/kgdb.c @@ -140,6 +140,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ptr = &remcomInBuffer[1]; if (kgdb_hex2long(&ptr, &addr)) regs->ret = addr; + fallthrough; case 'D': case 'k': From 3433adc8bd09fc9f29b8baddf33b4ecd1ecd2cdc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 23 Apr 2021 12:16:25 -0700 Subject: [PATCH 134/263] ARC: entry: fix off-by-one error in syscall number validation We have NR_syscall syscalls from [0 .. NR_syscall-1]. However the check for invalid syscall number is "> NR_syscall" as opposed to >=. This off-by-one error erronesously allows "NR_syscall" to be treated as valid syscall causeing out-of-bounds access into syscall-call table ensuing a crash (holes within syscall table have a invalid-entry handler but this is beyond the array implementing the table). This problem showed up on v5.6 kernel when testing glibc 2.33 (v5.10 kernel capable, includng faccessat2 syscall 439). The v5.6 kernel has NR_syscalls=439 (0 to 438). Due to the bug, 439 passed by glibc was not handled as -ENOSYS but processed leading to a crash. Link: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/48 Reported-by: Shahab Vahedi Cc: Signed-off-by: Vineet Gupta --- arch/arc/kernel/entry.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index 1743506081da6..2cb8dfe866b66 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -177,7 +177,7 @@ tracesys: ; Do the Sys Call as we normally would. ; Validate the Sys Call number - cmp r8, NR_syscalls + cmp r8, NR_syscalls - 1 mov.hi r0, -ENOSYS bhi tracesys_exit @@ -255,7 +255,7 @@ ENTRY(EV_Trap) ;============ Normal syscall case ; syscall num shd not exceed the total system calls avail - cmp r8, NR_syscalls + cmp r8, NR_syscalls - 1 mov.hi r0, -ENOSYS bhi .Lret_from_system_call From c5f756d8c6265ebb1736a7787231f010a3b782e5 Mon Sep 17 00:00:00 2001 From: Vladimir Isaev Date: Tue, 27 Apr 2021 15:12:37 +0300 Subject: [PATCH 135/263] ARC: mm: PAE: use 40-bit physical page mask 32-bit PAGE_MASK can not be used as a mask for physical addresses when PAE is enabled. PAGE_MASK_PHYS must be used for physical addresses instead of PAGE_MASK. Without this, init gets SIGSEGV if pte_modify was called: | potentially unexpected fatal signal 11. | Path: /bin/busybox | CPU: 0 PID: 1 Comm: init Not tainted 5.12.0-rc5-00003-g1e43c377a79f-dirty | Insn could not be fetched | @No matching VMA found | ECR: 0x00040000 EFA: 0x00000000 ERET: 0x00000000 | STAT: 0x80080082 [IE U ] BTA: 0x00000000 | SP: 0x5f9ffe44 FP: 0x00000000 BLK: 0xaf3d4 | LPS: 0x000d093e LPE: 0x000d0950 LPC: 0x00000000 | r00: 0x00000002 r01: 0x5f9fff14 r02: 0x5f9fff20 | ... | Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b Signed-off-by: Vladimir Isaev Reported-by: kernel test robot Cc: Vineet Gupta Cc: stable@vger.kernel.org Signed-off-by: Vineet Gupta --- arch/arc/include/asm/page.h | 12 ++++++++++++ arch/arc/include/asm/pgtable.h | 12 +++--------- arch/arc/include/uapi/asm/page.h | 1 - arch/arc/mm/ioremap.c | 5 +++-- arch/arc/mm/tlb.c | 2 +- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index ad9b7fe4dba36..4a9d33372fe2b 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -7,6 +7,18 @@ #include +#ifdef CONFIG_ARC_HAS_PAE40 + +#define MAX_POSSIBLE_PHYSMEM_BITS 40 +#define PAGE_MASK_PHYS (0xff00000000ull | PAGE_MASK) + +#else /* CONFIG_ARC_HAS_PAE40 */ + +#define MAX_POSSIBLE_PHYSMEM_BITS 32 +#define PAGE_MASK_PHYS PAGE_MASK + +#endif /* CONFIG_ARC_HAS_PAE40 */ + #ifndef __ASSEMBLY__ #define clear_page(paddr) memset((paddr), 0, PAGE_SIZE) diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 163641726a2b9..5878846f00cfe 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -107,8 +107,8 @@ #define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE) /* Set of bits not changed in pte_modify */ -#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_SPECIAL) - +#define _PAGE_CHG_MASK (PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \ + _PAGE_SPECIAL) /* More Abbrevaited helpers */ #define PAGE_U_NONE __pgprot(___DEF) #define PAGE_U_R __pgprot(___DEF | _PAGE_READ) @@ -132,13 +132,7 @@ #define PTE_BITS_IN_PD0 (_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ) #define PTE_BITS_RWX (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ) -#ifdef CONFIG_ARC_HAS_PAE40 -#define PTE_BITS_NON_RWX_IN_PD1 (0xff00000000 | PAGE_MASK | _PAGE_CACHEABLE) -#define MAX_POSSIBLE_PHYSMEM_BITS 40 -#else -#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK | _PAGE_CACHEABLE) -#define MAX_POSSIBLE_PHYSMEM_BITS 32 -#endif +#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK_PHYS | _PAGE_CACHEABLE) /************************************************************************** * Mapping of vm_flags (Generic VM) to PTE flags (arch specific) diff --git a/arch/arc/include/uapi/asm/page.h b/arch/arc/include/uapi/asm/page.h index 2a97e2718a219..2a4ad619abfba 100644 --- a/arch/arc/include/uapi/asm/page.h +++ b/arch/arc/include/uapi/asm/page.h @@ -33,5 +33,4 @@ #define PAGE_MASK (~(PAGE_SIZE-1)) - #endif /* _UAPI__ASM_ARC_PAGE_H */ diff --git a/arch/arc/mm/ioremap.c b/arch/arc/mm/ioremap.c index fac4adc902044..95c649fbc95af 100644 --- a/arch/arc/mm/ioremap.c +++ b/arch/arc/mm/ioremap.c @@ -53,9 +53,10 @@ EXPORT_SYMBOL(ioremap); void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size, unsigned long flags) { + unsigned int off; unsigned long vaddr; struct vm_struct *area; - phys_addr_t off, end; + phys_addr_t end; pgprot_t prot = __pgprot(flags); /* Don't allow wraparound, zero size */ @@ -72,7 +73,7 @@ void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size, /* Mappings have to be page-aligned */ off = paddr & ~PAGE_MASK; - paddr &= PAGE_MASK; + paddr &= PAGE_MASK_PHYS; size = PAGE_ALIGN(end + 1) - paddr; /* diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index 9bb3c24f36770..9c7c682472896 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -576,7 +576,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned, pte_t *ptep) { unsigned long vaddr = vaddr_unaligned & PAGE_MASK; - phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK; + phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; struct page *page = pfn_to_page(pte_pfn(*ptep)); create_tlb(vma, vaddr, ptep); From 1d5e4640e5df15252398c1b621f6bd432f2d7f17 Mon Sep 17 00:00:00 2001 From: Vladimir Isaev Date: Tue, 27 Apr 2021 15:13:54 +0300 Subject: [PATCH 136/263] ARC: mm: Use max_high_pfn as a HIGHMEM zone border Commit 4af22ded0ecf ("arc: fix memory initialization for systems with two memory banks") fixed highmem, but for the PAE case it causes bug messages: | BUG: Bad page state in process swapper pfn:80000 | page:(ptrval) refcount:0 mapcount:1 mapping:00000000 index:0x0 pfn:0x80000 flags: 0x0() | raw: 00000000 00000100 00000122 00000000 00000000 00000000 00000000 00000000 | raw: 00000000 | page dumped because: nonzero mapcount | Modules linked in: | CPU: 0 PID: 0 Comm: swapper Not tainted 5.12.0-rc5-00003-g1e43c377a79f #1 This is because the fix expects highmem to be always less than lowmem and uses min_low_pfn as an upper zone border for highmem. max_high_pfn should be ok for both highmem and highmem+PAE cases. Fixes: 4af22ded0ecf ("arc: fix memory initialization for systems with two memory banks") Signed-off-by: Vladimir Isaev Cc: Mike Rapoport Cc: stable@vger.kernel.org #5.8 onwards Signed-off-by: Vineet Gupta --- arch/arc/mm/init.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 33832e36bdb7d..e2ed355438c96 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -157,7 +157,16 @@ void __init setup_arch_memory(void) min_high_pfn = PFN_DOWN(high_mem_start); max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz); - max_zone_pfn[ZONE_HIGHMEM] = min_low_pfn; + /* + * max_high_pfn should be ok here for both HIGHMEM and HIGHMEM+PAE. + * For HIGHMEM without PAE max_high_pfn should be less than + * min_low_pfn to guarantee that these two regions don't overlap. + * For PAE case highmem is greater than lowmem, so it is natural + * to use max_high_pfn. + * + * In both cases, holes should be handled by pfn_valid(). + */ + max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn; high_memory = (void *)(min_high_pfn << PAGE_SHIFT); From bf9e262fcfa6350269f00a95658f701f2595db13 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Sat, 8 May 2021 11:07:33 +0800 Subject: [PATCH 137/263] docs/zh_CN: Remove obsolete translation file This translation file was replaced by Documentation/translations/zh_CN/admin-guide/security-bugs.rst which was created in commit 2d153571003b ("docs/zh_CN: Add zh_CN/admin-guide/security-bugs.rst"). This is a translation left over from history. Remove it. Signed-off-by: Wan Jiabing Acked-by: Wu XiangCheng Link: https://lore.kernel.org/r/20210508030741.82655-1-wanjiabing@vivo.com Signed-off-by: Jonathan Corbet --- Documentation/translations/zh_CN/SecurityBugs | 50 ------------------- 1 file changed, 50 deletions(-) delete mode 100644 Documentation/translations/zh_CN/SecurityBugs diff --git a/Documentation/translations/zh_CN/SecurityBugs b/Documentation/translations/zh_CN/SecurityBugs deleted file mode 100644 index 2d0fffd122cee..0000000000000 --- a/Documentation/translations/zh_CN/SecurityBugs +++ /dev/null @@ -1,50 +0,0 @@ -Chinese translated version of Documentation/admin-guide/security-bugs.rst - -If you have any comment or update to the content, please contact the -original document maintainer directly. However, if you have a problem -communicating in English you can also ask the Chinese maintainer for -help. Contact the Chinese maintainer if this translation is outdated -or if there is a problem with the translation. - -Chinese maintainer: Harry Wei ---------------------------------------------------------------------- -Documentation/admin-guide/security-bugs.rst 的中文翻译 - -如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 -交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 -译存在问题,请联系中文版维护者。 - -中文版维护者: 贾威威 Harry Wei -中文版翻译者: 贾威威 Harry Wei -中文版校译者: 贾威威 Harry Wei - - -以下为正文 ---------------------------------------------------------------------- -Linux内核开发者认为安全非常重要。因此,我们想要知道当一个有关于 -安全的漏洞被发现的时候,并且它可能会被尽快的修复或者公开。请把这个安全 -漏洞报告给Linux内核安全团队。 - -1) 联系 - -linux内核安全团队可以通过email来联系。这是 -一组独立的安全工作人员,可以帮助改善漏洞报告并且公布和取消一个修复。安 -全团队有可能会从部分的维护者那里引进额外的帮助来了解并且修复安全漏洞。 -当遇到任何漏洞,所能提供的信息越多就越能诊断和修复。如果你不清楚什么 -是有帮助的信息,那就请重温一下admin-guide/reporting-bugs.rst文件中的概述过程。任 -何攻击性的代码都是非常有用的,未经报告者的同意不会被取消,除非它已经 -被公布于众。 - -2) 公开 - -Linux内核安全团队的宗旨就是和漏洞提交者一起处理漏洞的解决方案直 -到公开。我们喜欢尽快地完全公开漏洞。当一个漏洞或者修复还没有被完全地理 -解,解决方案没有通过测试或者供应商协调,可以合理地延迟公开。然而,我们 -期望这些延迟尽可能的短些,是可数的几天,而不是几个星期或者几个月。公开 -日期是通过安全团队和漏洞提供者以及供应商洽谈后的结果。公开时间表是从很 -短(特殊的,它已经被公众所知道)到几个星期。作为一个基本的默认政策,我 -们所期望通知公众的日期是7天的安排。 - -3) 保密协议 - -Linux内核安全团队不是一个正式的团体,因此不能加入任何的保密协议。 From 9e255e2b9afe948fb795cbaa854acc3904d4212c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 May 2021 16:19:07 -0700 Subject: [PATCH 138/263] Documentation: drop optional BOMs A few of the Documentation .rst files begin with a Unicode byte order mark (BOM). The BOM may signify endianess for 16-bit or 32-bit encodings or indicate that the text stream is indeed Unicode. We don't need it for either of those uses. It may also interfere with (confuse) some software. Since we don't need it and its use is optional, just delete the uses of it in Documentation/. https://en.wikipedia.org/wiki/Byte_order_mark Signed-off-by: Randy Dunlap Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Greg Kroah-Hartman Cc: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/20210506231907.14359-1-rdunlap@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/block/data-integrity.rst | 2 +- Documentation/process/kernel-enforcement-statement.rst | 2 +- Documentation/security/tpm/xen-tpmfront.rst | 2 +- Documentation/timers/no_hz.rst | 2 +- Documentation/usb/mtouchusb.rst | 2 +- Documentation/usb/usb-serial.rst | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/block/data-integrity.rst b/Documentation/block/data-integrity.rst index 4f2452a95c434..07a97aa266685 100644 --- a/Documentation/block/data-integrity.rst +++ b/Documentation/block/data-integrity.rst @@ -1,4 +1,4 @@ -============== +============== Data Integrity ============== diff --git a/Documentation/process/kernel-enforcement-statement.rst b/Documentation/process/kernel-enforcement-statement.rst index e5a1be4760476..dc2d813b2e793 100644 --- a/Documentation/process/kernel-enforcement-statement.rst +++ b/Documentation/process/kernel-enforcement-statement.rst @@ -1,4 +1,4 @@ -.. _process_statement_kernel: +.. _process_statement_kernel: Linux Kernel Enforcement Statement ---------------------------------- diff --git a/Documentation/security/tpm/xen-tpmfront.rst b/Documentation/security/tpm/xen-tpmfront.rst index 00d5b1db227d4..31c67522f2ade 100644 --- a/Documentation/security/tpm/xen-tpmfront.rst +++ b/Documentation/security/tpm/xen-tpmfront.rst @@ -1,4 +1,4 @@ -============================= +============================= Virtual TPM interface for Xen ============================= diff --git a/Documentation/timers/no_hz.rst b/Documentation/timers/no_hz.rst index c4c70e1aada3c..6cadad7c3aad4 100644 --- a/Documentation/timers/no_hz.rst +++ b/Documentation/timers/no_hz.rst @@ -1,4 +1,4 @@ -====================================== +====================================== NO_HZ: Reducing Scheduling-Clock Ticks ====================================== diff --git a/Documentation/usb/mtouchusb.rst b/Documentation/usb/mtouchusb.rst index d1111b74bf759..5ae1f74fe74b6 100644 --- a/Documentation/usb/mtouchusb.rst +++ b/Documentation/usb/mtouchusb.rst @@ -1,4 +1,4 @@ -================ +================ mtouchusb driver ================ diff --git a/Documentation/usb/usb-serial.rst b/Documentation/usb/usb-serial.rst index 8fa7dbd3da9a4..69586aeb60bb4 100644 --- a/Documentation/usb/usb-serial.rst +++ b/Documentation/usb/usb-serial.rst @@ -1,4 +1,4 @@ -========== +========== USB serial ========== From 0d3ae948741ac6d80e39ab27b45297367ee477de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Apr 2021 10:05:17 -0700 Subject: [PATCH 139/263] sh: Remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes this annoying warning: arch/sh/kernel/traps.c: In function ‘nmi_trap_handler’: arch/sh/kernel/traps.c:183:15: warning: unused variable ‘cpu’ [-Wunused-variable] 183 | unsigned int cpu = smp_processor_id(); Fixes: fe3f1d5d7cd3 ("sh: Get rid of nmi_count()") Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210414170517.1205430-1-eric.dumazet@gmail.com --- arch/sh/kernel/traps.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index f5beecdac6938..e76b221570999 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -180,7 +180,6 @@ static inline void arch_ftrace_nmi_exit(void) { } BUILD_TRAP_HANDLER(nmi) { - unsigned int cpu = smp_processor_id(); TRAP_HANDLER_DECL; arch_ftrace_nmi_enter(); From bb4031b8af804244a7e4349d38f6624f68664bd6 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Mon, 26 Apr 2021 09:56:18 +0300 Subject: [PATCH 140/263] clk: Skip clk provider registration when np is NULL commit 6579c8d97ad7 ("clk: Mark fwnodes when their clock provider is added") revealed that clk/bcm/clk-raspberrypi.c driver calls devm_of_clk_add_hw_provider(), with a NULL dev->of_node, which resulted in a NULL pointer dereference in of_clk_add_hw_provider() when calling fwnode_dev_initialized(). Returning 0 is reducing the if conditions in driver code and is being consistent with the CONFIG_OF=n inline stub that returns 0 when CONFIG_OF is disabled. The downside is that drivers will maybe register clkdev lookups when they don't need to and waste some memory. Fixes: 6579c8d97ad7 ("clk: Mark fwnodes when their clock provider is added") Fixes: 3c9ea42802a1 ("clk: Mark fwnodes when their clock provider is added/removed") Reported-by: Marek Szyprowski Tested-by: Guenter Roeck Tested-by: Nathan Chancellor Reviewed-by: Stephen Boyd Reviewed-by: Saravana Kannan Reviewed-by: Nicolas Saenz Julienne Signed-off-by: Tudor Ambarus Link: https://lore.kernel.org/r/20210426065618.588144-1-tudor.ambarus@microchip.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/clk.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index e2ec1b7452439..65508eb89ec99 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4540,6 +4540,9 @@ int of_clk_add_provider(struct device_node *np, struct of_clk_provider *cp; int ret; + if (!np) + return 0; + cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) return -ENOMEM; @@ -4579,6 +4582,9 @@ int of_clk_add_hw_provider(struct device_node *np, struct of_clk_provider *cp; int ret; + if (!np) + return 0; + cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) return -ENOMEM; @@ -4676,6 +4682,9 @@ void of_clk_del_provider(struct device_node *np) { struct of_clk_provider *cp; + if (!np) + return; + mutex_lock(&of_clk_mutex); list_for_each_entry(cp, &of_clk_providers, link) { if (cp->node == np) { From 2515dd6ce8e545b0b2eece84920048ef9ed846c4 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 19 Apr 2021 16:17:41 -0700 Subject: [PATCH 141/263] stack: Replace "o" output with "r" input constraint "o" isn't a common asm() constraint to use; it triggers an assertion in assert-enabled builds of LLVM that it's not recognized when targeting aarch64 (though it appears to fall back to "m"). It's fixed in LLVM 13 now, but there isn't really a good reason to use "o" in particular here. To avoid causing build issues for those using assert-enabled builds of earlier LLVM versions, the constraint needs changing. Instead, if the point is to retain the __builtin_alloca(), make ptr appear to "escape" via being an input to an empty inline asm block. This is preferable anyways, since otherwise this looks like a dead store. While the use of "r" was considered in https://lore.kernel.org/lkml/202104011447.2E7F543@keescook/ it was only tested as an output (which looks like a dead store, and wasn't sufficient). Use "r" as an input constraint instead, which behaves correctly across compilers and architectures. Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Signed-off-by: Nick Desaulniers Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Tested-by: Kees Cook Tested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Link: https://reviews.llvm.org/D100412 Link: https://bugs.llvm.org/show_bug.cgi?id=49956 Link: https://lore.kernel.org/r/20210419231741.4084415-1-keescook@chromium.org --- include/linux/randomize_kstack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index fd80fab663a96..bebc911161b6f 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -38,7 +38,7 @@ void *__builtin_alloca(size_t size); u32 offset = raw_cpu_read(kstack_offset); \ u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset)); \ /* Keep allocation even after "ptr" loses scope. */ \ - asm volatile("" : "=o"(*ptr) :: "memory"); \ + asm volatile("" :: "r"(ptr) : "memory"); \ } \ } while (0) From cc2520909c2df9ad51d642bf09b3da26a9f56393 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Wed, 5 May 2021 19:33:35 +0200 Subject: [PATCH 142/263] MAINTAINERS: Update my e-mail Old e-mail address doesn't work anymore, update it to new one. Link: https://lore.kernel.org/r/20210505173335.1483575-1-jernej.skrabec@gmail.com Signed-off-by: Jernej Skrabec Signed-off-by: Maxime Ripard --- .mailmap | 1 + MAINTAINERS | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.mailmap b/.mailmap index 2d93232ed72b8..ca235ef4755f3 100644 --- a/.mailmap +++ b/.mailmap @@ -159,6 +159,7 @@ Jeff Layton Jeff Layton Jens Axboe Jens Osterkamp +Jernej Skrabec Jiri Slaby Jiri Slaby Jiri Slaby diff --git a/MAINTAINERS b/MAINTAINERS index 7fdc513392f45..2e9063d018d0e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1572,7 +1572,7 @@ F: drivers/clk/sunxi/ ARM/Allwinner sunXi SoC support M: Maxime Ripard M: Chen-Yu Tsai -R: Jernej Skrabec +R: Jernej Skrabec L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux.git @@ -5003,7 +5003,7 @@ S: Maintained F: drivers/net/fddi/defza.* DEINTERLACE DRIVERS FOR ALLWINNER H3 -M: Jernej Skrabec +M: Jernej Skrabec L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@ -5527,7 +5527,7 @@ F: include/linux/power/smartreflex.h DRM DRIVER FOR ALLWINNER DE2 AND DE3 ENGINE M: Maxime Ripard M: Chen-Yu Tsai -R: Jernej Skrabec +R: Jernej Skrabec L: dri-devel@lists.freedesktop.org S: Supported T: git git://anongit.freedesktop.org/drm/drm-misc @@ -5903,7 +5903,7 @@ M: Andrzej Hajda M: Neil Armstrong R: Laurent Pinchart R: Jonas Karlman -R: Jernej Skrabec +R: Jernej Skrabec S: Maintained T: git git://anongit.freedesktop.org/drm/drm-misc F: drivers/gpu/drm/bridge/ @@ -15490,7 +15490,7 @@ F: include/uapi/linux/rose.h F: net/rose/ ROTATION DRIVER FOR ALLWINNER A83T -M: Jernej Skrabec +M: Jernej Skrabec L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git From 1b55767dfdd93c42712e67e986ac14f0c4debd0c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 11 May 2021 00:25:05 +0800 Subject: [PATCH 143/263] erofs: fix broken illustration in documentation Illustration was broken after ReST conversion by accident. (checked by 'make SPHINXDIRS="filesystems" htmldocs') Link: https://lore.kernel.org/r/20210510162506.28637-1-xiang@kernel.org Fixes: e66d8631ddb3 ("docs: filesystems: convert erofs.txt to ReST") Reviewed-by: Chao Yu Cc: Mauro Carvalho Chehab Signed-off-by: Gao Xiang --- Documentation/filesystems/erofs.rst | 119 ++++++++++++++-------------- 1 file changed, 59 insertions(+), 60 deletions(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index bf145171c2bf8..869b183ff2158 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -113,31 +113,31 @@ may not. All metadatas can be now observed in two different spaces (views): :: - |-> aligned with 8B - |-> followed closely - + meta_blkaddr blocks |-> another slot - _____________________________________________________________________ - | ... | inode | xattrs | extents | data inline | ... | inode ... - |________|_______|(optional)|(optional)|__(optional)_|_____|__________ - |-> aligned with the inode slot size - . . - . . - . . - . . - . . - . . - .____________________________________________________|-> aligned with 4B - | xattr_ibody_header | shared xattrs | inline xattrs | - |____________________|_______________|_______________| - |-> 12 bytes <-|->x * 4 bytes<-| . - . . . - . . . - . . . - ._______________________________.______________________. - | id | id | id | id | ... | id | ent | ... | ent| ... | - |____|____|____|____|______|____|_____|_____|____|_____| - |-> aligned with 4B - |-> aligned with 4B + |-> aligned with 8B + |-> followed closely + + meta_blkaddr blocks |-> another slot + _____________________________________________________________________ + | ... | inode | xattrs | extents | data inline | ... | inode ... + |________|_______|(optional)|(optional)|__(optional)_|_____|__________ + |-> aligned with the inode slot size + . . + . . + . . + . . + . . + . . + .____________________________________________________|-> aligned with 4B + | xattr_ibody_header | shared xattrs | inline xattrs | + |____________________|_______________|_______________| + |-> 12 bytes <-|->x * 4 bytes<-| . + . . . + . . . + . . . + ._______________________________.______________________. + | id | id | id | id | ... | id | ent | ... | ent| ... | + |____|____|____|____|______|____|_____|_____|____|_____| + |-> aligned with 4B + |-> aligned with 4B Inode could be 32 or 64 bytes, which can be distinguished from a common field which all inode versions have -- i_format:: @@ -175,13 +175,13 @@ may not. All metadatas can be now observed in two different spaces (views): Each share xattr can also be directly found by the following formula: xattr offset = xattr_blkaddr * block_size + 4 * xattr_id - :: +:: - |-> aligned by 4 bytes - + xattr_blkaddr blocks |-> aligned with 4 bytes - _________________________________________________________________________ - | ... | xattr_entry | xattr data | ... | xattr_entry | xattr data ... - |________|_____________|_____________|_____|______________|_______________ + |-> aligned by 4 bytes + + xattr_blkaddr blocks |-> aligned with 4 bytes + _________________________________________________________________________ + | ... | xattr_entry | xattr data | ... | xattr_entry | xattr data ... + |________|_____________|_____________|_____|______________|_______________ Directories ----------- @@ -193,19 +193,18 @@ algorithm (could refer to the related source code). :: - ___________________________ - / | - / ______________|________________ - / / | nameoff1 | nameoffN-1 - ____________.______________._______________v________________v__________ - | dirent | dirent | ... | dirent | filename | filename | ... | filename | - |___.0___|____1___|_____|___N-1__|____0_____|____1_____|_____|___N-1____| - \ ^ - \ | * could have - \ | trailing '\0' - \________________________| nameoff0 - - Directory block + ___________________________ + / | + / ______________|________________ + / / | nameoff1 | nameoffN-1 + ____________.______________._______________v________________v__________ + | dirent | dirent | ... | dirent | filename | filename | ... | filename | + |___.0___|____1___|_____|___N-1__|____0_____|____1_____|_____|___N-1____| + \ ^ + \ | * could have + \ | trailing '\0' + \________________________| nameoff0 + Directory block Note that apart from the offset of the first filename, nameoff0 also indicates the total number of directory entries in this block since it is no need to @@ -216,22 +215,22 @@ Compression Currently, EROFS supports 4KB fixed-sized output transparent file compression, as illustrated below:: - |---- Variant-Length Extent ----|-------- VLE --------|----- VLE ----- - clusterofs clusterofs clusterofs - | | | logical data - _________v_______________________________v_____________________v_______________ - ... | . | | . | | . | ... - ____|____.________|_____________|________.____|_____________|__.__________|____ - |-> cluster <-|-> cluster <-|-> cluster <-|-> cluster <-|-> cluster <-| - size size size size size - . . . . - . . . . - . . . . - _______._____________._____________._____________._____________________ - ... | | | | ... physical data - _______|_____________|_____________|_____________|_____________________ - |-> cluster <-|-> cluster <-|-> cluster <-| - size size size + |<- variable-sized extent ->|<- VLE ->| + clusterofs clusterofs clusterofs + | | | + _________v_________________________________v_______________________v________ + ... | . | | . | | . ... + ____|____._________|______________|________.___ _|______________|__.________ + |-> lcluster <-|-> lcluster <-|-> lcluster <-|-> lcluster <-| + size size size size . . + . . . . + . . . . + . . . . + _______.______________.______________.______________._________________ + ... | | | | ... + _______|______________|______________|______________|_________________ + |-> pcluster <-|-> pcluster <-|-> pcluster <-| + size size size Currently each on-disk physical cluster can contain 4KB (un)compressed data at most. For each logical cluster, there is a corresponding on-disk index to From 46f2e04484aee056c97f79162da83ac7d2d621bb Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 11 May 2021 16:44:14 +0800 Subject: [PATCH 144/263] erofs: update documentation about data compression Add more description about (NON)HEAD lclusters, and the new big pcluster feature. Link: https://lore.kernel.org/r/20210511084414.21305-1-xiang@kernel.org Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- Documentation/filesystems/erofs.rst | 68 +++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index 869b183ff2158..832839fcf4c3b 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -50,8 +50,8 @@ Here is the main features of EROFS: - Support POSIX.1e ACLs by using xattrs; - - Support transparent file compression as an option: - LZ4 algorithm with 4 KB fixed-sized output compression for high performance. + - Support transparent data compression as an option: + LZ4 algorithm with the fixed-sized output compression for high performance. The following git tree provides the file system user-space tools under development (ex, formatting tool mkfs.erofs): @@ -210,10 +210,21 @@ Note that apart from the offset of the first filename, nameoff0 also indicates the total number of directory entries in this block since it is no need to introduce another on-disk field at all. -Compression ------------ -Currently, EROFS supports 4KB fixed-sized output transparent file compression, -as illustrated below:: +Data compression +---------------- +EROFS implements LZ4 fixed-sized output compression which generates fixed-sized +compressed data blocks from variable-sized input in contrast to other existing +fixed-sized input solutions. Relatively higher compression ratios can be gotten +by using fixed-sized output compression since nowadays popular data compression +algorithms are mostly LZ77-based and such fixed-sized output approach can be +benefited from the historical dictionary (aka. sliding window). + +In details, original (uncompressed) data is turned into several variable-sized +extents and in the meanwhile, compressed into physical clusters (pclusters). +In order to record each variable-sized extent, logical clusters (lclusters) are +introduced as the basic unit of compress indexes to indicate whether a new +extent is generated within the range (HEAD) or not (NONHEAD). Lclusters are now +fixed in block size, as illustrated below:: |<- variable-sized extent ->|<- VLE ->| clusterofs clusterofs clusterofs @@ -222,18 +233,37 @@ as illustrated below:: ... | . | | . | | . ... ____|____._________|______________|________.___ _|______________|__.________ |-> lcluster <-|-> lcluster <-|-> lcluster <-|-> lcluster <-| - size size size size . . - . . . . - . . . . - . . . . - _______.______________.______________.______________._________________ + (HEAD) (NONHEAD) (HEAD) (NONHEAD) . + . CBLKCNT . . + . . . + . . . + _______._____________________________.______________._________________ ... | | | | ... _______|______________|______________|______________|_________________ - |-> pcluster <-|-> pcluster <-|-> pcluster <-| - size size size - -Currently each on-disk physical cluster can contain 4KB (un)compressed data -at most. For each logical cluster, there is a corresponding on-disk index to -describe its cluster type, physical cluster address, etc. - -See "struct z_erofs_vle_decompressed_index" in erofs_fs.h for more details. + |-> big pcluster <-|-> pcluster <-| + +A physical cluster can be seen as a container of physical compressed blocks +which contains compressed data. Previously, only lcluster-sized (4KB) pclusters +were supported. After big pcluster feature is introduced (available since +Linux v5.13), pcluster can be a multiple of lcluster size. + +For each HEAD lcluster, clusterofs is recorded to indicate where a new extent +starts and blkaddr is used to seek the compressed data. For each NONHEAD +lcluster, delta0 and delta1 are available instead of blkaddr to indicate the +distance to its HEAD lcluster and the next HEAD lcluster. A PLAIN lcluster is +also a HEAD lcluster except that its data is uncompressed. See the comments +around "struct z_erofs_vle_decompressed_index" in erofs_fs.h for more details. + +If big pcluster is enabled, pcluster size in lclusters needs to be recorded as +well. Let the delta0 of the first NONHEAD lcluster store the compressed block +count with a special flag as a new called CBLKCNT NONHEAD lcluster. It's easy +to understand its delta0 is constantly 1, as illustrated below:: + + __________________________________________________________ + | HEAD | NONHEAD | NONHEAD | ... | NONHEAD | HEAD | HEAD | + |__:___|_(CBLKCNT)_|_________|_____|_________|__:___|____:_| + |<----- a big pcluster (with CBLKCNT) ------>|<-- -->| + a lcluster-sized pcluster (without CBLKCNT) ^ + +If another HEAD follows a HEAD lcluster, there is no room to record CBLKCNT, +but it's easy to know the size of such pcluster is 1 lcluster as well. From a5c936add6a23c15c6ae538ab7a12f80751fdf0f Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 21 Apr 2021 13:20:31 +0800 Subject: [PATCH 145/263] drm/i915/dp: Use slow and wide link training for everything MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Screen flickers on Innolux eDP 1.3 panel when clock rate 540000 is in use. According to the panel vendor, though clock rate 540000 is advertised, but the max clock rate it really supports is 270000. Ville Syrjälä mentioned that fast and narrow also breaks some eDP 1.4 panel, so use slow and wide training for all panels to resolve the issue. User also confirmed that the new strategy doesn't introduce any regression on XPS 9380. v2: - Use slow and wide for everything. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/3384 References: https://gitlab.freedesktop.org/drm/intel/-/issues/272 Signed-off-by: Kai-Heng Feng Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20210421052054.1434718-1-kai.heng.feng@canonical.com (cherry picked from commit acca7762eb71bc05a8f28d29320d193150051f79) Fixes: 2bbd6dba84d4 ("drm/i915: Try to use fast+narrow link on eDP again and fall back to the old max strategy on failure") Cc: # v5.12+ Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dp.c | 59 +++---------------------- 1 file changed, 5 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 6a2dee8cef1f1..1e026177ed1ba 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1095,44 +1095,6 @@ intel_dp_compute_link_config_wide(struct intel_dp *intel_dp, return -EINVAL; } -/* Optimize link config in order: max bpp, min lanes, min clock */ -static int -intel_dp_compute_link_config_fast(struct intel_dp *intel_dp, - struct intel_crtc_state *pipe_config, - const struct link_config_limits *limits) -{ - const struct drm_display_mode *adjusted_mode = &pipe_config->hw.adjusted_mode; - int bpp, clock, lane_count; - int mode_rate, link_clock, link_avail; - - for (bpp = limits->max_bpp; bpp >= limits->min_bpp; bpp -= 2 * 3) { - int output_bpp = intel_dp_output_bpp(pipe_config->output_format, bpp); - - mode_rate = intel_dp_link_required(adjusted_mode->crtc_clock, - output_bpp); - - for (lane_count = limits->min_lane_count; - lane_count <= limits->max_lane_count; - lane_count <<= 1) { - for (clock = limits->min_clock; clock <= limits->max_clock; clock++) { - link_clock = intel_dp->common_rates[clock]; - link_avail = intel_dp_max_data_rate(link_clock, - lane_count); - - if (mode_rate <= link_avail) { - pipe_config->lane_count = lane_count; - pipe_config->pipe_bpp = bpp; - pipe_config->port_clock = link_clock; - - return 0; - } - } - } - } - - return -EINVAL; -} - static int intel_dp_dsc_compute_bpp(struct intel_dp *intel_dp, u8 dsc_max_bpc) { int i, num_bpc; @@ -1382,22 +1344,11 @@ intel_dp_compute_link_config(struct intel_encoder *encoder, intel_dp_can_bigjoiner(intel_dp)) pipe_config->bigjoiner = true; - if (intel_dp_is_edp(intel_dp)) - /* - * Optimize for fast and narrow. eDP 1.3 section 3.3 and eDP 1.4 - * section A.1: "It is recommended that the minimum number of - * lanes be used, using the minimum link rate allowed for that - * lane configuration." - * - * Note that we fall back to the max clock and lane count for eDP - * panels that fail with the fast optimal settings (see - * intel_dp->use_max_params), in which case the fast vs. wide - * choice doesn't matter. - */ - ret = intel_dp_compute_link_config_fast(intel_dp, pipe_config, &limits); - else - /* Optimize for slow and wide. */ - ret = intel_dp_compute_link_config_wide(intel_dp, pipe_config, &limits); + /* + * Optimize for slow and wide for everything, because there are some + * eDP 1.3 and 1.4 panels don't work well with fast and narrow. + */ + ret = intel_dp_compute_link_config_wide(intel_dp, pipe_config, &limits); /* enable compression if the mode doesn't fit available BW */ drm_dbg_kms(&i915->drm, "Force DSC en = %d\n", intel_dp->force_dsc_en); From 9b8a233bc294dd71d3c7d30692a78ab32f246a0f Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Fri, 30 Apr 2021 21:30:55 +0530 Subject: [PATCH 146/263] btrfs: handle transaction start error in btrfs_fileattr_set Add error handling in btrfs_fileattr_set in case of an error while starting a transaction. This fixes btrfs/232 which otherwise used to fail with below signature on Power. btrfs/232 [ 1119.474650] run fstests btrfs/232 at 2021-04-21 02:21:22 <...> [ 1366.638585] BUG: Unable to handle kernel data access on read at 0xffffffffffffff86 [ 1366.638768] Faulting instruction address: 0xc0000000009a5c88 cpu 0x0: Vector: 380 (Data SLB Access) at [c000000014f177b0] pc: c0000000009a5c88: btrfs_update_root_times+0x58/0xc0 lr: c0000000009a5c84: btrfs_update_root_times+0x54/0xc0 <...> pid = 24881, comm = fsstress btrfs_update_inode+0xa0/0x140 btrfs_fileattr_set+0x5d0/0x6f0 vfs_fileattr_set+0x2a8/0x390 do_vfs_ioctl+0x1290/0x1ac0 sys_ioctl+0x6c/0x120 system_call_exception+0x3d4/0x410 system_call_common+0xec/0x278 Fixes: 97fc29775487 ("btrfs: convert to fileattr") Signed-off-by: Ritesh Harjani Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ee1dbabb5d3c4..98ecb70466bf3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -259,6 +259,8 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, if (!fa->flags_valid) { /* 1 item for the inode */ trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); goto update_flags; } From efed9a3337e341bd0989161b97453b52567bc59d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 10 May 2021 17:05:35 -0700 Subject: [PATCH 147/263] kyber: fix out of bounds access when preempted __blk_mq_sched_bio_merge() gets the ctx and hctx for the current CPU and passes the hctx to ->bio_merge(). kyber_bio_merge() then gets the ctx for the current CPU again and uses that to get the corresponding Kyber context in the passed hctx. However, the thread may be preempted between the two calls to blk_mq_get_ctx(), and the ctx returned the second time may no longer correspond to the passed hctx. This "works" accidentally most of the time, but it can cause us to read garbage if the second ctx came from an hctx with more ctx's than the first one (i.e., if ctx->index_hw[hctx->type] > hctx->nr_ctx). This manifested as this UBSAN array index out of bounds error reported by Jakub: UBSAN: array-index-out-of-bounds in ../kernel/locking/qspinlock.c:130:9 index 13106 is out of range for type 'long unsigned int [128]' Call Trace: dump_stack+0xa4/0xe5 ubsan_epilogue+0x5/0x40 __ubsan_handle_out_of_bounds.cold.13+0x2a/0x34 queued_spin_lock_slowpath+0x476/0x480 do_raw_spin_lock+0x1c2/0x1d0 kyber_bio_merge+0x112/0x180 blk_mq_submit_bio+0x1f5/0x1100 submit_bio_noacct+0x7b0/0x870 submit_bio+0xc2/0x3a0 btrfs_map_bio+0x4f0/0x9d0 btrfs_submit_data_bio+0x24e/0x310 submit_one_bio+0x7f/0xb0 submit_extent_page+0xc4/0x440 __extent_writepage_io+0x2b8/0x5e0 __extent_writepage+0x28d/0x6e0 extent_write_cache_pages+0x4d7/0x7a0 extent_writepages+0xa2/0x110 do_writepages+0x8f/0x180 __writeback_single_inode+0x99/0x7f0 writeback_sb_inodes+0x34e/0x790 __writeback_inodes_wb+0x9e/0x120 wb_writeback+0x4d2/0x660 wb_workfn+0x64d/0xa10 process_one_work+0x53a/0xa80 worker_thread+0x69/0x5b0 kthread+0x20b/0x240 ret_from_fork+0x1f/0x30 Only Kyber uses the hctx, so fix it by passing the request_queue to ->bio_merge() instead. BFQ and mq-deadline just use that, and Kyber can map the queues itself to avoid the mismatch. Fixes: a6088845c2bf ("block: kyber: make kyber more friendly with merging") Reported-by: Jakub Kicinski Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/c7598605401a48d5cfeadebb678abd10af22b83f.1620691329.git.osandov@fb.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +-- block/blk-mq-sched.c | 8 +++++--- block/kyber-iosched.c | 5 +++-- block/mq-deadline.c | 3 +-- include/linux/elevator.h | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0270cd7ca1658..59b2499d3f8be 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2263,10 +2263,9 @@ static void bfq_remove_request(struct request_queue *q, } -static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; struct request *free = NULL; /* diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 42a365b1b9c0e..996a4b2f73aa9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -358,14 +358,16 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; bool ret = false; enum hctx_type type; if (e && e->type->ops.bio_merge) - return e->type->ops.bio_merge(hctx, bio, nr_segs); + return e->type->ops.bio_merge(q, bio, nr_segs); + ctx = blk_mq_get_ctx(q); + hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || list_empty_careful(&ctx->rq_lists[type])) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 8969e122f0811..81e3279ecd574 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -561,11 +561,12 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) } } -static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); struct kyber_hctx_data *khd = hctx->sched_data; - struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); struct list_head *rq_list = &kcq->rq_list[sched_domain]; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 04aded71ead27..8eea2cbf2bf4a 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -461,10 +461,9 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, return ELEVATOR_NO_MERGE; } -static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1fe8e105b83bf..dcb2f9022c1df 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -34,7 +34,7 @@ struct elevator_mq_ops { void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); - bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *, unsigned int); + bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); From 5e1f689913a4498e3081093670ef9d85b2c60920 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 14:18:53 +0200 Subject: [PATCH 148/263] nvme-multipath: fix double initialization of ANA state nvme_init_identify and thus nvme_mpath_init can be called multiple times and thus must not overwrite potentially initialized or in-use fields. Split out a helper for the basic initialization when the controller is initialized and make sure the init_identify path does not blindly change in-use data structures. Fixes: 0d0b660f214d ("nvme: add ANA support") Reported-by: Martin Wilck Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 3 +- drivers/nvme/host/multipath.c | 55 ++++++++++++++++++----------------- drivers/nvme/host/nvme.h | 8 +++-- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 522c9b229f80e..762125f2905f7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2901,7 +2901,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); } - ret = nvme_mpath_init(ctrl, id); + ret = nvme_mpath_init_identify(ctrl, id); if (ret < 0) goto out_free; @@ -4364,6 +4364,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, min(default_ps_max_latency_us, (unsigned long)S32_MAX)); nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); + nvme_mpath_init_ctrl(ctrl); return 0; out_free_name: diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0551796517e61..deb14562c96ae 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -781,9 +781,18 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) put_disk(head->disk); } -int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) { - int error; + mutex_init(&ctrl->ana_lock); + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); + INIT_WORK(&ctrl->ana_work, nvme_ana_work); +} + +int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; + size_t ana_log_size; + int error = 0; /* check if multipath is enabled and we have the capability */ if (!multipath || !ctrl->subsys || @@ -795,37 +804,31 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); - mutex_init(&ctrl->ana_lock); - timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); - ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + - ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); - ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); - - if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { + ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + + ctrl->max_namespaces * sizeof(__le32); + if (ana_log_size > max_transfer_size) { dev_err(ctrl->device, - "ANA log page size (%zd) larger than MDTS (%d).\n", - ctrl->ana_log_size, - ctrl->max_hw_sectors << SECTOR_SHIFT); + "ANA log page size (%zd) larger than MDTS (%zd).\n", + ana_log_size, max_transfer_size); dev_err(ctrl->device, "disabling ANA support.\n"); - return 0; + goto out_uninit; } - - INIT_WORK(&ctrl->ana_work, nvme_ana_work); - kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); - if (!ctrl->ana_log_buf) { - error = -ENOMEM; - goto out; + if (ana_log_size > ctrl->ana_log_size) { + nvme_mpath_stop(ctrl); + kfree(ctrl->ana_log_buf); + ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); + if (!ctrl->ana_log_buf) + return -ENOMEM; } - + ctrl->ana_log_size = ana_log_size; error = nvme_read_ana_log(ctrl); if (error) - goto out_free_ana_log_buf; + goto out_uninit; return 0; -out_free_ana_log_buf: - kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = NULL; -out: + +out_uninit: + nvme_mpath_uninit(ctrl); return error; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 05f31a2c64bb2..0015860ec12bf 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -712,7 +712,8 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); void nvme_mpath_remove_disk(struct nvme_ns_head *head); -int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); +int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); +void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); void nvme_mpath_uninit(struct nvme_ctrl *ctrl); void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); @@ -780,7 +781,10 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) static inline void nvme_trace_bio_complete(struct request *req) { } -static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, +static inline void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) +{ +} +static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { if (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) From 608a969046e6e0567d05a166be66c77d2dd8220b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 6 May 2021 18:51:35 -0700 Subject: [PATCH 149/263] nvmet: fix inline bio check for bdev-ns When handling rw commands, for inline bio case we only consider transfer size. This works well when req->sg_cnt fits into the req->inline_bvec, but it will result in the warning in __bio_add_page() when req->sg_cnt > NVMET_MAX_INLINE_BVEC. Consider an I/O size 32768 and first page is not aligned to the page boundary, then I/O is split in following manner :- [ 2206.256140] nvmet: sg->length 3440 sg->offset 656 [ 2206.256144] nvmet: sg->length 4096 sg->offset 0 [ 2206.256148] nvmet: sg->length 4096 sg->offset 0 [ 2206.256152] nvmet: sg->length 4096 sg->offset 0 [ 2206.256155] nvmet: sg->length 4096 sg->offset 0 [ 2206.256159] nvmet: sg->length 4096 sg->offset 0 [ 2206.256163] nvmet: sg->length 4096 sg->offset 0 [ 2206.256166] nvmet: sg->length 4096 sg->offset 0 [ 2206.256170] nvmet: sg->length 656 sg->offset 0 Now the req->transfer_size == NVMET_MAX_INLINE_DATA_LEN i.e. 32768, but the req->sg_cnt is (9) > NVMET_MAX_INLINE_BIOVEC which is (8). This will result in the following warning message :- nvmet_bdev_execute_rw() bio_add_page() __bio_add_page() WARN_ON_ONCE(bio_full(bio, len)); This scenario is very hard to reproduce on the nvme-loop transport only with rw commands issued with the passthru IOCTL interface from the host application and the data buffer is allocated with the malloc() and not the posix_memalign(). Fixes: 73383adfad24 ("nvmet: don't split large I/Os unconditionally") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-bdev.c | 2 +- drivers/nvme/target/nvmet.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 9a8b3726a37c4..429263ca9b978 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -258,7 +258,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) sector = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); - if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { + if (nvmet_use_inline_bvec(req)) { bio = &req->b.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); } else { diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 5566ed403576e..d69a409515d65 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -616,4 +616,10 @@ static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba) return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT); } +static inline bool nvmet_use_inline_bvec(struct nvmet_req *req) +{ + return req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN && + req->sg_cnt <= NVMET_MAX_INLINE_BIOVEC; +} + #endif /* _NVMET_H */ From ab96de5def854d8fc51280b6a20597e64b14ac31 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 6 May 2021 18:51:36 -0700 Subject: [PATCH 150/263] nvmet: fix inline bio check for passthru When handling passthru commands, for inline bio allocation we only consider the transfer size. This works well when req->sg_cnt fits into the req->inline_bvec, but it will result in the early return from bio_add_hw_page() when req->sg_cnt > NVMET_MAX_INLINE_BVEC. Consider an I/O of size 32768 and first buffer is not aligned to the page boundary, then I/O is split in following manner :- [ 2206.256140] nvmet: sg->length 3440 sg->offset 656 [ 2206.256144] nvmet: sg->length 4096 sg->offset 0 [ 2206.256148] nvmet: sg->length 4096 sg->offset 0 [ 2206.256152] nvmet: sg->length 4096 sg->offset 0 [ 2206.256155] nvmet: sg->length 4096 sg->offset 0 [ 2206.256159] nvmet: sg->length 4096 sg->offset 0 [ 2206.256163] nvmet: sg->length 4096 sg->offset 0 [ 2206.256166] nvmet: sg->length 4096 sg->offset 0 [ 2206.256170] nvmet: sg->length 656 sg->offset 0 Now the req->transfer_size == NVMET_MAX_INLINE_DATA_LEN i.e. 32768, but the req->sg_cnt is (9) > NVMET_MAX_INLINE_BIOVEC which is (8). This will result in early return in the following code path :- nvmet_bdev_execute_rw() bio_add_pc_page() bio_add_hw_page() if (bio_full(bio, len)) return 0; Use previously introduced helper nvmet_use_inline_bvec() to consider req->sg_cnt when using inline bio. This only affects nvme-loop transport. Fixes: dab3902b19a0 ("nvmet: use inline bio for passthru fast path") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 2798944899b73..39b1473f7204e 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -194,7 +194,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) if (req->sg_cnt > BIO_MAX_VECS) return -EINVAL; - if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { + if (nvmet_use_inline_bvec(req)) { bio = &req->p.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); } else { From 8cc365f9559b86802afc0208389f5c8d46b4ad61 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Thu, 6 May 2021 10:08:19 +0300 Subject: [PATCH 151/263] nvmet-rdma: Fix NULL deref when SEND is completed with error When running some traffic and taking down the link on peer, a retry counter exceeded error is received. This leads to nvmet_rdma_error_comp which tried accessing the cq_context to obtain the queue. The cq_context is no longer valid after the fix to use shared CQ mechanism and should be obtained similar to how it is obtained in other functions from the wc->qp. [ 905.786331] nvmet_rdma: SEND for CQE 0x00000000e3337f90 failed with status transport retry counter exceeded (12). [ 905.832048] BUG: unable to handle kernel NULL pointer dereference at 0000000000000048 [ 905.839919] PGD 0 P4D 0 [ 905.842464] Oops: 0000 1 SMP NOPTI [ 905.846144] CPU: 13 PID: 1557 Comm: kworker/13:1H Kdump: loaded Tainted: G OE --------- - - 4.18.0-304.el8.x86_64 #1 [ 905.872135] RIP: 0010:nvmet_rdma_error_comp+0x5/0x1b [nvmet_rdma] [ 905.878259] Code: 19 4f c0 e8 89 b3 a5 f6 e9 5b e0 ff ff 0f b7 75 14 4c 89 ea 48 c7 c7 08 1a 4f c0 e8 71 b3 a5 f6 e9 4b e0 ff ff 0f 1f 44 00 00 <48> 8b 47 48 48 85 c0 74 08 48 89 c7 e9 98 bf 49 00 e9 c3 e3 ff ff [ 905.897135] RSP: 0018:ffffab601c45fe28 EFLAGS: 00010246 [ 905.902387] RAX: 0000000000000065 RBX: ffff9e729ea2f800 RCX: 0000000000000000 [ 905.909558] RDX: 0000000000000000 RSI: ffff9e72df9567c8 RDI: 0000000000000000 [ 905.916731] RBP: ffff9e729ea2b400 R08: 000000000000074d R09: 0000000000000074 [ 905.923903] R10: 0000000000000000 R11: ffffab601c45fcc0 R12: 0000000000000010 [ 905.931074] R13: 0000000000000000 R14: 0000000000000010 R15: ffff9e729ea2f400 [ 905.938247] FS: 0000000000000000(0000) GS:ffff9e72df940000(0000) knlGS:0000000000000000 [ 905.938249] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 905.950067] nvmet_rdma: SEND for CQE 0x00000000c7356cca failed with status transport retry counter exceeded (12). [ 905.961855] CR2: 0000000000000048 CR3: 000000678d010004 CR4: 00000000007706e0 [ 905.961855] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 905.961856] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 905.961857] PKRU: 55555554 [ 906.010315] Call Trace: [ 906.012778] __ib_process_cq+0x89/0x170 [ib_core] [ 906.017509] ib_cq_poll_work+0x26/0x80 [ib_core] [ 906.022152] process_one_work+0x1a7/0x360 [ 906.026182] ? create_worker+0x1a0/0x1a0 [ 906.030123] worker_thread+0x30/0x390 [ 906.033802] ? create_worker+0x1a0/0x1a0 [ 906.037744] kthread+0x116/0x130 [ 906.040988] ? kthread_flush_work_fn+0x10/0x10 [ 906.045456] ret_from_fork+0x1f/0x40 Fixes: ca0f1a8055be2 ("nvmet-rdma: use new shared CQ mechanism") Signed-off-by: Shai Malin Signed-off-by: Michal Kalderon Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 6c1f3ab7649c7..7d607f435e366 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -700,7 +700,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; nvmet_rdma_release_rsp(rsp); @@ -786,7 +786,7 @@ static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, write_cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; struct rdma_cm_id *cm_id = rsp->queue->cm_id; u16 status; From 3651aaacd10b2f8cee3780c490fc2df55bd4f543 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 10 May 2021 12:15:36 -0700 Subject: [PATCH 152/263] nvmet: demote discovery cmd parse err msg to debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Host can send invalid commands and flood the target with error messages for the discovery controller. Demote the error message from pr_err() to pr_debug( in nvmet_parse_discovery_cmd().  Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/discovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 4845d12e374ac..fc3645fc2c249 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -379,7 +379,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) req->execute = nvmet_execute_disc_identify; return 0; default: - pr_err("unhandled cmd %d\n", cmd->common.opcode); + pr_debug("unhandled cmd %d\n", cmd->common.opcode); req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } From 4c2dab2bf5ace0ddc07ca7f04a7ba32fc3b23492 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 10 May 2021 12:15:37 -0700 Subject: [PATCH 153/263] nvmet: use helper to remove the duplicate code Use the helper nvmet_report_invalid_opcode() to report invalid opcode so we can remove the duplicate code. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e7a367cf6d367..dcd49a72f2f3c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -975,10 +975,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) case nvme_admin_keep_alive: req->execute = nvmet_execute_keep_alive; return 0; + default: + return nvmet_report_invalid_opcode(req); } - - pr_debug("unhandled cmd %d on qid %d\n", cmd->common.opcode, - req->sq->qid); - req->error_loc = offsetof(struct nvme_common_command, opcode); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } From 7a4ffd20ec6d31dfde2cc5608851e5109ffed7c9 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 10 May 2021 12:15:38 -0700 Subject: [PATCH 154/263] nvmet: demote fabrics cmd parse err msg to debug Host can send invalid commands and flood the target with error messages. Demote the error message from pr_err() to pr_debug() in nvmet_parse_fabrics_cmd() and nvmet_parse_connect_cmd(). Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 1420a8e3e0b10..7d0f3523fdab2 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -94,7 +94,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) req->execute = nvmet_execute_prop_get; break; default: - pr_err("received unknown capsule type 0x%x\n", + pr_debug("received unknown capsule type 0x%x\n", cmd->fabrics.fctype); req->error_loc = offsetof(struct nvmf_common_command, fctype); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; @@ -284,13 +284,13 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) struct nvme_command *cmd = req->cmd; if (!nvme_is_fabrics(cmd)) { - pr_err("invalid command 0x%x on unconnected queue.\n", + pr_debug("invalid command 0x%x on unconnected queue.\n", cmd->fabrics.opcode); req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { - pr_err("invalid capsule type 0x%x on unconnected queue.\n", + pr_debug("invalid capsule type 0x%x on unconnected queue.\n", cmd->fabrics.fctype); req->error_loc = offsetof(struct nvmf_common_command, fctype); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; From 918d9c77791cc8267b5b5ab556c868dfa57e0d93 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 May 2021 17:01:28 +0200 Subject: [PATCH 155/263] docs: cdrom-standard.rst: get rid of uneeded UTF-8 chars This file was converted from a LaTeX one. The conversion used some UTF-8 characters at the literal blocks. Replace them by normal ASCII characters. Signed-off-by: Mauro Carvalho Chehab Acked-by: Jens Axboe Link: https://lore.kernel.org/r/79c3f482da17ea48d69b6e6ad1b7fb102b9dd7bf.1620744606.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/cdrom/cdrom-standard.rst | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst index 70500b189cc84..5845960ca3821 100644 --- a/Documentation/cdrom/cdrom-standard.rst +++ b/Documentation/cdrom/cdrom-standard.rst @@ -146,18 +146,18 @@ with the kernel as a block device by registering the following general *struct file_operations*:: struct file_operations cdrom_fops = { - NULL, /∗ lseek ∗/ - block _read , /∗ read—general block-dev read ∗/ - block _write, /∗ write—general block-dev write ∗/ - NULL, /∗ readdir ∗/ - NULL, /∗ select ∗/ - cdrom_ioctl, /∗ ioctl ∗/ - NULL, /∗ mmap ∗/ - cdrom_open, /∗ open ∗/ - cdrom_release, /∗ release ∗/ - NULL, /∗ fsync ∗/ - NULL, /∗ fasync ∗/ - NULL /∗ revalidate ∗/ + NULL, /* lseek */ + block _read , /* read--general block-dev read */ + block _write, /* write--general block-dev write */ + NULL, /* readdir */ + NULL, /* select */ + cdrom_ioctl, /* ioctl */ + NULL, /* mmap */ + cdrom_open, /* open */ + cdrom_release, /* release */ + NULL, /* fsync */ + NULL, /* fasync */ + NULL /* revalidate */ }; Every active CD-ROM device shares this *struct*. The routines @@ -250,12 +250,12 @@ The drive-specific, minor-like information that is registered with `cdrom.c`, currently contains the following fields:: struct cdrom_device_info { - const struct cdrom_device_ops * ops; /* device operations for this major */ + const struct cdrom_device_ops * ops; /* device operations for this major */ struct list_head list; /* linked list of all device_info */ struct gendisk * disk; /* matching block layer disk */ void * handle; /* driver-dependent data */ - int mask; /* mask of capability: disables them */ + int mask; /* mask of capability: disables them */ int speed; /* maximum speed for reading data */ int capacity; /* number of discs in a jukebox */ @@ -569,7 +569,7 @@ the *CDC_CLOSE_TRAY* bit in *mask*. In the file `cdrom.c` you will encounter many constructions of the type:: - if (cdo->capability & ∼cdi->mask & CDC _⟨capability⟩) ... + if (cdo->capability & ~cdi->mask & CDC _) ... There is no *ioctl* to set the mask... The reason is that I think it is better to control the **behavior** rather than the From 8d3926c09e043448d4d26896b8225943f12d0933 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 May 2021 17:01:29 +0200 Subject: [PATCH 156/263] docs: ABI: remove a meaningless UTF-8 character MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those two files have this character: - U+00ac ('¬'): NOT SIGN at the end of the first line, apparently for no reason. Drop them. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6cd3f0b47568fecb7889fd18d1d744c3aaf73866.1620744606.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/ABI/obsolete/sysfs-kernel-fadump_registered | 2 +- Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/obsolete/sysfs-kernel-fadump_registered b/Documentation/ABI/obsolete/sysfs-kernel-fadump_registered index 0360be39c98e9..dae880b1a5d5d 100644 --- a/Documentation/ABI/obsolete/sysfs-kernel-fadump_registered +++ b/Documentation/ABI/obsolete/sysfs-kernel-fadump_registered @@ -1,4 +1,4 @@ -This ABI is renamed and moved to a new location /sys/kernel/fadump/registered.¬ +This ABI is renamed and moved to a new location /sys/kernel/fadump/registered. What: /sys/kernel/fadump_registered Date: Feb 2012 diff --git a/Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem b/Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem index 6ce0b129ab12d..ca2396edb5f10 100644 --- a/Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem +++ b/Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem @@ -1,4 +1,4 @@ -This ABI is renamed and moved to a new location /sys/kernel/fadump/release_mem.¬ +This ABI is renamed and moved to a new location /sys/kernel/fadump/release_mem. What: /sys/kernel/fadump_release_mem Date: Feb 2012 From 6f3bceba03b4f18e0b83261e2fb761e0ad5da625 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 May 2021 17:01:30 +0200 Subject: [PATCH 157/263] docs: ABI: remove some spurious characters The KernelVersion tag contains some spurious UTF-8 characters for no reason. Drop them. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6d774ad6cb3795a177309503a39f8f1b5e309d64.1620744606.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/ABI/testing/sysfs-module | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-module b/Documentation/ABI/testing/sysfs-module index a485434d2a0fb..88bddf192ceb7 100644 --- a/Documentation/ABI/testing/sysfs-module +++ b/Documentation/ABI/testing/sysfs-module @@ -37,13 +37,13 @@ Description: Maximum time allowed for periodic transfers per microframe (μs) What: /sys/module/*/{coresize,initsize} Date: Jan 2012 -KernelVersion:»·3.3 +KernelVersion: 3.3 Contact: Kay Sievers Description: Module size in bytes. What: /sys/module/*/taint Date: Jan 2012 -KernelVersion:»·3.3 +KernelVersion: 3.3 Contact: Kay Sievers Description: Module taint flags: == ===================== From d1f2722d5357d7a5138b1be8bd64946f0a14c81e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 May 2021 17:01:31 +0200 Subject: [PATCH 158/263] docs: hwmon: tmp103.rst: fix bad usage of UTF-8 chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While UTF-8 characters can be used at the Linux documentation, the best is to use them only when ASCII doesn't offer a good replacement. So, replace the occurences of the following UTF-8 characters: - U+2013 ('–'): EN DASH In this specific case, EN DASH was used instead of a minus sign. So, replace it by a single hyphen. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/73b3c7c1eef5c12ddc941624d23689313bd56529.1620744606.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/hwmon/tmp103.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/hwmon/tmp103.rst b/Documentation/hwmon/tmp103.rst index e195a7d14309a..b3ef81475cf8b 100644 --- a/Documentation/hwmon/tmp103.rst +++ b/Documentation/hwmon/tmp103.rst @@ -21,10 +21,10 @@ Description The TMP103 is a digital output temperature sensor in a four-ball wafer chip-scale package (WCSP). The TMP103 is capable of reading temperatures to a resolution of 1°C. The TMP103 is specified for -operation over a temperature range of –40°C to +125°C. +operation over a temperature range of -40°C to +125°C. Resolution: 8 Bits -Accuracy: ±1°C Typ (–10°C to +100°C) +Accuracy: ±1°C Typ (-10°C to +100°C) The driver provides the common sysfs-interface for temperatures (see Documentation/hwmon/sysfs-interface.rst under Temperatures). From 5e716ec68b4a75a84e28c0efa68db613deb64981 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 May 2021 17:01:32 +0200 Subject: [PATCH 159/263] docs: networking: device_drivers: fix bad usage of UTF-8 chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probably because the original file was pre-processed by some tool, both i40e.rst and iavf.rst files are using this character: - U+2013 ('–'): EN DASH meaning an hyphen when calling a command line application, which is obviously wrong. So, replace them by an hyphen, ensuring that it will be properly displayed as literals when building the documentation. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/95eb2a48d0ca3528780ce0dfce64359977fa8cb3.1620744606.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- .../networking/device_drivers/ethernet/intel/i40e.rst | 4 ++-- .../networking/device_drivers/ethernet/intel/iavf.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst index 8a9b18573688d..2d3f6bd969a2b 100644 --- a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst +++ b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst @@ -173,7 +173,7 @@ Director rule is added from ethtool (Sideband filter), ATR is turned off by the driver. To re-enable ATR, the sideband can be disabled with the ethtool -K option. For example:: - ethtool –K [adapter] ntuple [off|on] + ethtool -K [adapter] ntuple [off|on] If sideband is re-enabled after ATR is re-enabled, ATR remains enabled until a TCP-IP flow is added. When all TCP-IP sideband rules are deleted, ATR is @@ -688,7 +688,7 @@ shaper bw_rlimit: for each tc, sets minimum and maximum bandwidth rates. Totals must be equal or less than port speed. For example: min_rate 1Gbit 3Gbit: Verify bandwidth limit using network -monitoring tools such as ifstat or sar –n DEV [interval] [number of samples] +monitoring tools such as `ifstat` or `sar -n DEV [interval] [number of samples]` 2. Enable HW TC offload on interface:: diff --git a/Documentation/networking/device_drivers/ethernet/intel/iavf.rst b/Documentation/networking/device_drivers/ethernet/intel/iavf.rst index 52e037b11c979..25330b7b5168d 100644 --- a/Documentation/networking/device_drivers/ethernet/intel/iavf.rst +++ b/Documentation/networking/device_drivers/ethernet/intel/iavf.rst @@ -179,7 +179,7 @@ shaper bw_rlimit: for each tc, sets minimum and maximum bandwidth rates. Totals must be equal or less than port speed. For example: min_rate 1Gbit 3Gbit: Verify bandwidth limit using network -monitoring tools such as ifstat or sar –n DEV [interval] [number of samples] +monitoring tools such as ``ifstat`` or ``sar -n DEV [interval] [number of samples]`` NOTE: Setting up channels via ethtool (ethtool -L) is not supported when the From 7240cd200541543008a7ce4fcaf2ba5a5556128f Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Tue, 11 May 2021 09:49:37 -0400 Subject: [PATCH 160/263] Remove link to nonexistent rocket driver docs The rocket driver and documentation were removed in this commit, but the corresponding entry in index.rst was not removed. Signed-off-by: Desmond Cheong Zhi Xi Fixes: 3b00b6af7a5b ("tty: rocket, remove the driver") Link: https://lore.kernel.org/r/20210511134937.2442291-1-desmondcheongzx@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/driver-api/serial/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/driver-api/serial/index.rst b/Documentation/driver-api/serial/index.rst index 21351b8c95a4d..8f7d7af3b90b1 100644 --- a/Documentation/driver-api/serial/index.rst +++ b/Documentation/driver-api/serial/index.rst @@ -19,7 +19,6 @@ Serial drivers moxa-smartio n_gsm - rocket serial-iso7816 serial-rs485 From 875d598db60ac81e768fdfd2c589f6209038488b Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 11 May 2021 18:34:13 +0200 Subject: [PATCH 161/263] MAINTAINERS: Update address for Emma Anholt Reviewed-by: Emma Anholt Signed-off-by: Daniel Vetter --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index bd7aff0c120f2..38a1e3bf5af08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5639,7 +5639,7 @@ T: git git://anongit.freedesktop.org/drm/drm-misc F: drivers/gpu/drm/sun4i/sun8i* DRM DRIVER FOR ARM PL111 CLCD -M: Eric Anholt +M: Emma Anholt S: Supported T: git git://anongit.freedesktop.org/drm/drm-misc F: drivers/gpu/drm/pl111/ @@ -5719,7 +5719,7 @@ T: git git://anongit.freedesktop.org/drm/drm-misc F: drivers/gpu/drm/tiny/gm12u320.c DRM DRIVER FOR HX8357D PANELS -M: Eric Anholt +M: Emma Anholt S: Maintained T: git git://anongit.freedesktop.org/drm/drm-misc F: Documentation/devicetree/bindings/display/himax,hx8357d.txt @@ -6177,7 +6177,7 @@ F: Documentation/devicetree/bindings/display/ti/ F: drivers/gpu/drm/omapdrm/ DRM DRIVERS FOR V3D -M: Eric Anholt +M: Emma Anholt S: Supported T: git git://anongit.freedesktop.org/drm/drm-misc F: Documentation/devicetree/bindings/gpu/brcm,bcm-v3d.yaml @@ -6185,7 +6185,7 @@ F: drivers/gpu/drm/v3d/ F: include/uapi/drm/v3d_drm.h DRM DRIVERS FOR VC4 -M: Eric Anholt +M: Emma Anholt M: Maxime Ripard S: Supported T: git git://github.com/anholt/linux From e09784a8a751e539dffc94d43bc917b0ac1e934a Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 11 May 2021 03:45:16 +0200 Subject: [PATCH 162/263] alarmtimer: Check RTC features instead of ops RTC drivers used to leave .set_alarm() NULL in order to signal the RTC device doesn't support alarms. The drivers are now clearing the RTC_FEATURE_ALARM bit for that purpose in order to keep the rtc_class_ops structure const. So now, .set_alarm() is set unconditionally and this possibly causes the alarmtimer code to select an RTC device that doesn't support alarms. Test RTC_FEATURE_ALARM instead of relying on ops->set_alarm to determine whether alarms are available. Fixes: 7ae41220ef58 ("rtc: introduce features bitfield") Signed-off-by: Alexandre Belloni Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210511014516.563031-1-alexandre.belloni@bootlin.com --- kernel/time/alarmtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index bea9d08b16988..5897828b9d7ed 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -92,7 +92,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (rtcdev) return -EBUSY; - if (!rtc->ops->set_alarm) + if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) return -1; if (!device_may_wakeup(rtc->dev.parent)) return -1; From 349c4d6c75d74b62d8e39913b40bd06117b85e4a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 9 May 2021 21:53:03 -0700 Subject: [PATCH 163/263] f2fs: avoid null pointer access when handling IPU error Unable to handle kernel NULL pointer dereference at virtual address 000000000000001a pc : f2fs_inplace_write_data+0x144/0x208 lr : f2fs_inplace_write_data+0x134/0x208 Call trace: f2fs_inplace_write_data+0x144/0x208 f2fs_do_write_data_page+0x270/0x770 f2fs_write_single_data_page+0x47c/0x830 __f2fs_write_data_pages+0x444/0x98c f2fs_write_data_pages.llvm.16514453770497736882+0x2c/0x38 do_writepages+0x58/0x118 __writeback_single_inode+0x44/0x300 writeback_sb_inodes+0x4b8/0x9c8 wb_writeback+0x148/0x42c wb_do_writeback+0xc8/0x390 wb_workfn+0xb0/0x2f4 process_one_work+0x1fc/0x444 worker_thread+0x268/0x4b4 kthread+0x13c/0x158 ret_from_fork+0x10/0x18 Fixes: 955772787667 ("f2fs: drop inplace IO if fs status is abnormal") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c605415840b59..51dc79fad4fe2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3574,12 +3574,12 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) return err; drop_bio: - if (fio->bio) { + if (fio->bio && *(fio->bio)) { struct bio *bio = *(fio->bio); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - fio->bio = NULL; + *(fio->bio) = NULL; } return err; } From a753103909a7e3d22147505d944da3d20759e1a5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 May 2021 12:11:14 -0700 Subject: [PATCH 164/263] f2fs: support iflag change given the mask In f2fs_fileattr_set(), if (!fa->flags_valid) mask &= FS_COMMON_FL; In this case, we can set supported flags by mask only instead of BUG_ON. /* Flags shared betwen flags/xflags */ (FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NODUMP_FL | FS_NOATIME_FL | FS_DAX_FL | \ FS_PROJINHERIT_FL) Fixes: 9b1bb01c8ae7 ("f2fs: convert to fileattr") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 44a4650aea7b7..ceb575f99048c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1817,7 +1817,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) struct f2fs_inode_info *fi = F2FS_I(inode); u32 masked_flags = fi->i_flags & mask; - f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask)); + /* mask can be shrunk by flags_valid selector */ + iflags &= mask; /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) From a12cc5b423d4f36dc1a1ea3911e49cf9dff43898 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 May 2021 17:00:43 +0800 Subject: [PATCH 165/263] f2fs: compress: fix to free compress page correctly In error path of f2fs_write_compressed_pages(), it needs to call f2fs_compress_free_page() to release temporary page. Fixes: 5e6bbde95982 ("f2fs: introduce mempool for {,de}compress intermediate page allocation") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 53b13787eb2c8..2acaefa100364 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1372,7 +1372,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->nr_cpages; i++) { if (!cc->cpages[i]) continue; - f2fs_put_page(cc->cpages[i], 1); + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; } out_put_cic: kmem_cache_free(cic_entry_slab, cic); From a949dc5f2c5cfe0c910b664650f45371254c0744 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 May 2021 17:30:31 +0800 Subject: [PATCH 166/263] f2fs: compress: fix race condition of overwrite vs truncate pos_fsstress testcase complains a panic as belew: ------------[ cut here ]------------ kernel BUG at fs/f2fs/compress.c:1082! invalid opcode: 0000 [#1] SMP PTI CPU: 4 PID: 2753477 Comm: kworker/u16:2 Tainted: G OE 5.12.0-rc1-custom #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Workqueue: writeback wb_workfn (flush-252:16) RIP: 0010:prepare_compress_overwrite+0x4c0/0x760 [f2fs] Call Trace: f2fs_prepare_compress_overwrite+0x5f/0x80 [f2fs] f2fs_write_cache_pages+0x468/0x8a0 [f2fs] f2fs_write_data_pages+0x2a4/0x2f0 [f2fs] do_writepages+0x38/0xc0 __writeback_single_inode+0x44/0x2a0 writeback_sb_inodes+0x223/0x4d0 __writeback_inodes_wb+0x56/0xf0 wb_writeback+0x1dd/0x290 wb_workfn+0x309/0x500 process_one_work+0x220/0x3c0 worker_thread+0x53/0x420 kthread+0x12f/0x150 ret_from_fork+0x22/0x30 The root cause is truncate() may race with overwrite as below, so that one reference count left in page can not guarantee the page attaching in mapping tree all the time, after truncation, later find_lock_page() may return NULL pointer. - prepare_compress_overwrite - f2fs_pagecache_get_page - unlock_page - f2fs_setattr - truncate_setsize - truncate_inode_page - delete_from_page_cache - find_lock_page Fix this by avoiding referencing updated page. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 2acaefa100364..79348bc56e351 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -117,19 +117,6 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) f2fs_drop_rpages(cc, len, true); } -static void f2fs_put_rpages_mapping(struct address_space *mapping, - pgoff_t start, int len) -{ - int i; - - for (i = 0; i < len; i++) { - struct page *page = find_get_page(mapping, start + i); - - put_page(page); - put_page(page); - } -} - static void f2fs_put_rpages_wbc(struct compress_ctx *cc, struct writeback_control *wbc, bool redirty, int unlock) { @@ -1036,7 +1023,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, } if (PageUptodate(page)) - unlock_page(page); + f2fs_put_page(page, 1); else f2fs_compress_ctx_add_page(cc, page); } @@ -1046,32 +1033,34 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); + f2fs_put_rpages(cc); f2fs_destroy_compress_ctx(cc); if (ret) - goto release_pages; + goto out; if (bio) f2fs_submit_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) - goto release_pages; + goto out; } for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); page = find_lock_page(mapping, start_idx + i); - f2fs_bug_on(sbi, !page); + if (!page) { + /* page can be truncated */ + goto release_and_retry; + } f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_compress_ctx_add_page(cc, page); - f2fs_put_page(page, 0); if (!PageUptodate(page)) { +release_and_retry: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_put_rpages_mapping(mapping, start_idx, - cc->cluster_size); f2fs_destroy_compress_ctx(cc); goto retry; } @@ -1103,10 +1092,10 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, } unlock_pages: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); -release_pages: - f2fs_put_rpages_mapping(mapping, start_idx, i); f2fs_destroy_compress_ctx(cc); +out: return ret; } From 8bfbfb0ddd706b1ce2e89259ecc45f192c0ec2bf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 May 2021 17:30:32 +0800 Subject: [PATCH 167/263] f2fs: compress: fix to assign cc.cluster_idx correctly In f2fs_destroy_compress_ctx(), after f2fs_destroy_compress_ctx(), cc.cluster_idx will be cleared w/ NULL_CLUSTER, f2fs_cluster_blocks() may check wrong cluster metadata, fix it. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 17 +++++++++-------- fs/f2fs/data.c | 6 +++--- fs/f2fs/f2fs.h | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 79348bc56e351..925a5ca3744a9 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -145,13 +145,14 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc) return cc->rpages ? 0 : -ENOMEM; } -void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; - cc->cluster_idx = NULL_CLUSTER; + if (!reuse) + cc->cluster_idx = NULL_CLUSTER; } void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) @@ -1034,7 +1035,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); f2fs_put_rpages(cc); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); if (ret) goto out; if (bio) @@ -1061,7 +1062,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); goto retry; } } @@ -1094,7 +1095,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, unlock_pages: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); out: return ret; } @@ -1130,7 +1131,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, set_cluster_dirty(&cc); f2fs_put_rpages_wbc(&cc, NULL, false, 1); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); return first_index; } @@ -1350,7 +1351,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, f2fs_put_rpages(cc); page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: @@ -1512,7 +1513,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); f2fs_put_rpages_wbc(cc, wbc, false, 0); destroy_out: - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return err; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 96f1a354f89fd..33e56ae84e358 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2287,7 +2287,7 @@ static int f2fs_mpage_readpages(struct inode *inode, max_nr_pages, &last_block_in_bio, rac != NULL, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } @@ -2332,7 +2332,7 @@ static int f2fs_mpage_readpages(struct inode *inode, max_nr_pages, &last_block_in_bio, rac != NULL, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); } } #endif @@ -3033,7 +3033,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } } if (f2fs_compressed_file(inode)) - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); #endif if (retry) { index = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 044878866ca34..c83d90125ebd9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3956,7 +3956,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); -void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); From a78339698ab1f43435fbe67fcd6de8f4f6eb9eec Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 May 2021 14:49:45 +0000 Subject: [PATCH 168/263] powerpc/interrupts: Fix kuep_unlock() call Same as kuap_user_restore(), kuep_unlock() has to be called when really returning to user, that is in interrupt_exit_user_prepare(), not in interrupt_exit_prepare(). Fixes: b5efec00b671 ("powerpc/32s: Move KUEP locking/unlocking in C") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b831e54a2579db24fbef836ed415588ce2b3e825.1620312573.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/interrupt.h | 2 -- arch/powerpc/kernel/interrupt.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 44cde2e129b88..c77e8f57ff062 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -153,8 +153,6 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup */ static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt_state *state) { - if (user_mode(regs)) - kuep_unlock(); } static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct interrupt_state *state) diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index e4559f8914eb7..ed6cebcb78475 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -427,6 +427,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned /* Restore user access locks last */ kuap_user_restore(regs); + kuep_unlock(); return ret; } From 5d510ed78bcfcbbd3b3891cbe79cd7543bce1d05 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 May 2021 11:56:31 +0000 Subject: [PATCH 169/263] powerpc/syscall: Calling kuap_save_and_lock() is wrong kuap_save_and_lock() is only for interrupts inside kernel. system call are only from user, calling kuap_save_and_lock() is wrong. Fixes: c16728835eec ("powerpc/32: Manage KUAP in C") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/332773775cf24a422105dee2d383fb8f04589045.1620302182.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/interrupt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index ed6cebcb78475..e0938ba298f2a 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -34,9 +34,6 @@ notrace long system_call_exception(long r3, long r4, long r5, syscall_fn f; kuep_lock(); -#ifdef CONFIG_PPC32 - kuap_save_and_lock(regs); -#endif regs->orig_gpr3 = r3; From 2c8c89b95831f46a2fb31a8d0fef4601694023ce Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 8 May 2021 20:14:52 +1000 Subject: [PATCH 170/263] powerpc/pseries: Fix hcall tracing recursion in pv queued spinlocks The paravit queued spinlock slow path adds itself to the queue then calls pv_wait to wait for the lock to become free. This is implemented by calling H_CONFER to donate cycles. When hcall tracing is enabled, this H_CONFER call can lead to a spin lock being taken in the tracing code, which will result in the lock to be taken again, which will also go to the slow path because it queues behind itself and so won't ever make progress. An example trace of a deadlock: __pv_queued_spin_lock_slowpath trace_clock_global ring_buffer_lock_reserve trace_event_buffer_lock_reserve trace_event_buffer_reserve trace_event_raw_event_hcall_exit __trace_hcall_exit plpar_hcall_norets_trace __pv_queued_spin_lock_slowpath trace_clock_global ring_buffer_lock_reserve trace_event_buffer_lock_reserve trace_event_buffer_reserve trace_event_raw_event_rcu_dyntick rcu_irq_exit irq_exit __do_irq call_do_irq do_IRQ hardware_interrupt_common_virt Fix this by introducing plpar_hcall_norets_notrace(), and using that to make SPLPAR virtual processor dispatching hcalls by the paravirt spinlock code. Signed-off-by: Nicholas Piggin Reviewed-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210508101455.1578318-2-npiggin@gmail.com --- arch/powerpc/include/asm/hvcall.h | 3 +++ arch/powerpc/include/asm/paravirt.h | 22 +++++++++++++++++++--- arch/powerpc/platforms/pseries/hvCall.S | 10 ++++++++++ arch/powerpc/platforms/pseries/lpar.c | 3 +-- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 4430509060185..e3b29eda8074c 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -448,6 +448,9 @@ */ long plpar_hcall_norets(unsigned long opcode, ...); +/* Variant which does not do hcall tracing */ +long plpar_hcall_norets_notrace(unsigned long opcode, ...); + /** * plpar_hcall: - Make a pseries hypervisor call * @opcode: The hypervisor call to make. diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h index 5d1726bb28e79..bcb7b5f917be6 100644 --- a/arch/powerpc/include/asm/paravirt.h +++ b/arch/powerpc/include/asm/paravirt.h @@ -28,19 +28,35 @@ static inline u32 yield_count_of(int cpu) return be32_to_cpu(yield_count); } +/* + * Spinlock code confers and prods, so don't trace the hcalls because the + * tracing code takes spinlocks which can cause recursion deadlocks. + * + * These calls are made while the lock is not held: the lock slowpath yields if + * it can not acquire the lock, and unlock slow path might prod if a waiter has + * yielded). So this may not be a problem for simple spin locks because the + * tracing does not technically recurse on the lock, but we avoid it anyway. + * + * However the queued spin lock contended path is more strictly ordered: the + * H_CONFER hcall is made after the task has queued itself on the lock, so then + * recursing on that lock will cause the task to then queue up again behind the + * first instance (or worse: queued spinlocks use tricks that assume a context + * never waits on more than one spinlock, so such recursion may cause random + * corruption in the lock code). + */ static inline void yield_to_preempted(int cpu, u32 yield_count) { - plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count); + plpar_hcall_norets_notrace(H_CONFER, get_hard_smp_processor_id(cpu), yield_count); } static inline void prod_cpu(int cpu) { - plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + plpar_hcall_norets_notrace(H_PROD, get_hard_smp_processor_id(cpu)); } static inline void yield_to_any(void) { - plpar_hcall_norets(H_CONFER, -1, 0); + plpar_hcall_norets_notrace(H_CONFER, -1, 0); } #else static inline bool is_shared_processor(void) diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index 2136e42833af3..8a2b8d64265bc 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -102,6 +102,16 @@ END_FTR_SECTION(0, 1); \ #define HCALL_BRANCH(LABEL) #endif +_GLOBAL_TOC(plpar_hcall_norets_notrace) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + HVSC /* invoke the hypervisor */ + lwz r0,8(r1) + mtcrf 0xff,r0 + blr /* return r3 = status */ + _GLOBAL_TOC(plpar_hcall_norets) HMT_MEDIUM diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 1f3152ad72132..b619568a4d04a 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1830,8 +1830,7 @@ void hcall_tracepoint_unregfunc(void) /* * Since the tracing code might execute hcalls we need to guard against - * recursion. One example of this are spinlocks calling H_YIELD on - * shared processor partitions. + * recursion. */ static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); From a3f1a39a5643d5c5ed3eee4edd933e0ebfeeed6e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 8 May 2021 20:14:53 +1000 Subject: [PATCH 171/263] powerpc/pseries: Don't trace hcall tracing wrapper This doesn't seem very useful to trace before the recursion check, even if the ftrace code has any recursion checks of its own. Be on the safe side and don't trace the hcall trace wrappers. Reported-by: Naveen N. Rao Signed-off-by: Nicholas Piggin Reviewed-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210508101455.1578318-3-npiggin@gmail.com --- arch/powerpc/platforms/pseries/lpar.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index b619568a4d04a..d79d7410c3204 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1835,7 +1835,7 @@ void hcall_tracepoint_unregfunc(void) static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); -void __trace_hcall_entry(unsigned long opcode, unsigned long *args) +notrace void __trace_hcall_entry(unsigned long opcode, unsigned long *args) { unsigned long flags; unsigned int *depth; @@ -1863,7 +1863,7 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args) local_irq_restore(flags); } -void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf) +notrace void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf) { unsigned long flags; unsigned int *depth; From 7058f4b13edd9dd2cb3c5b4fe340d8307dbe0208 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 8 May 2021 20:14:54 +1000 Subject: [PATCH 172/263] powerpc/pseries: use notrace hcall variant for H_CEDE idle Rather than special-case H_CEDE in the hcall trace wrappers, make the idle H_CEDE call use plpar_hcall_norets_notrace(). Signed-off-by: Nicholas Piggin Reviewed-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210508101455.1578318-4-npiggin@gmail.com --- arch/powerpc/include/asm/plpar_wrappers.h | 6 +++++- arch/powerpc/platforms/pseries/lpar.c | 10 ---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index ece84a430701f..83e0f701ebc67 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -28,7 +28,11 @@ static inline void set_cede_latency_hint(u8 latency_hint) static inline long cede_processor(void) { - return plpar_hcall_norets(H_CEDE); + /* + * We cannot call tracepoints inside RCU idle regions which + * means we must not trace H_CEDE. + */ + return plpar_hcall_norets_notrace(H_CEDE); } static inline long extended_cede_processor(unsigned long latency_hint) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index d79d7410c3204..ad1cec80019bb 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1840,13 +1840,6 @@ notrace void __trace_hcall_entry(unsigned long opcode, unsigned long *args) unsigned long flags; unsigned int *depth; - /* - * We cannot call tracepoints inside RCU idle regions which - * means we must not trace H_CEDE. - */ - if (opcode == H_CEDE) - return; - local_irq_save(flags); depth = this_cpu_ptr(&hcall_trace_depth); @@ -1868,9 +1861,6 @@ notrace void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf) unsigned long flags; unsigned int *depth; - if (opcode == H_CEDE) - return; - local_irq_save(flags); depth = this_cpu_ptr(&hcall_trace_depth); From 4f242fc5f2e24412b89e934dad025b10293b2712 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 8 May 2021 20:14:55 +1000 Subject: [PATCH 173/263] powerpc/pseries: warn if recursing into the hcall tracing code The hcall tracing code has a recursion check built in, which skips tracing if we are already tracing an hcall. However if the tracing code has problems with recursion, this check may not catch all cases because the tracing code could be invoked from a different tracepoint first, then make an hcall that gets traced, then recurse. Add an explicit warning if recursion is detected here, which might help to notice tracing code making hcalls. Really the core trace code should have its own recursion checking and warnings though. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210508101455.1578318-5-npiggin@gmail.com --- arch/powerpc/platforms/pseries/lpar.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index ad1cec80019bb..dab356e3ff87c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1829,8 +1829,14 @@ void hcall_tracepoint_unregfunc(void) #endif /* - * Since the tracing code might execute hcalls we need to guard against - * recursion. + * Keep track of hcall tracing depth and prevent recursion. Warn if any is + * detected because it may indicate a problem. This will not catch all + * problems with tracing code making hcalls, because the tracing might have + * been invoked from a non-hcall, so the first hcall could recurse into it + * without warning here, but this better than nothing. + * + * Hcalls with specific problems being traced should use the _notrace + * plpar_hcall variants. */ static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); @@ -1844,7 +1850,7 @@ notrace void __trace_hcall_entry(unsigned long opcode, unsigned long *args) depth = this_cpu_ptr(&hcall_trace_depth); - if (*depth) + if (WARN_ON_ONCE(*depth)) goto out; (*depth)++; @@ -1865,7 +1871,7 @@ notrace void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf) depth = this_cpu_ptr(&hcall_trace_depth); - if (*depth) + if (*depth) /* Don't warn again on the way out */ goto out; (*depth)++; From 7315e457d6bc342d06ba0b7ee498221c5237a547 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 8 May 2021 09:25:32 +0000 Subject: [PATCH 174/263] powerpc/uaccess: Fix __get_user() with CONFIG_CC_HAS_ASM_GOTO_OUTPUT Building kernel mainline with GCC 11 leads to following failure when starting 'init': init[1]: bad frame in sys_sigreturn: 7ff5a900 nip 001083cc lr 001083c4 Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b This is an issue due to a segfault happening in __unsafe_restore_general_regs() in a loop copying registers from user to kernel: 10: 7d 09 03 a6 mtctr r8 14: 80 ca 00 00 lwz r6,0(r10) 18: 80 ea 00 04 lwz r7,4(r10) 1c: 90 c9 00 08 stw r6,8(r9) 20: 90 e9 00 0c stw r7,12(r9) 24: 39 0a 00 08 addi r8,r10,8 28: 39 29 00 08 addi r9,r9,8 2c: 81 4a 00 08 lwz r10,8(r10) <== r10 is clobbered here 30: 81 6a 00 0c lwz r11,12(r10) 34: 91 49 00 08 stw r10,8(r9) 38: 91 69 00 0c stw r11,12(r9) 3c: 39 48 00 08 addi r10,r8,8 40: 39 29 00 08 addi r9,r9,8 44: 42 00 ff d0 bdnz 14 <__unsafe_restore_general_regs+0x14> As shown above, this is due to r10 being re-used by GCC. This didn't happen with CLANG. This is fixed by tagging 'x' output as an earlyclobber operand in __get_user_asm2_goto(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cf0a050d124d4f426cdc7a74009d17b01d8d8969.1620465917.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index a09e4240c5b16..22c79ab400060 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -157,7 +157,7 @@ do { \ "2: lwz%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ EX_TABLE(2b, %l2) \ - : "=r" (x) \ + : "=&r" (x) \ : "m" (*addr) \ : \ : label) From bc581dbab26edf0b6acc98c76943b4a0c7d672a2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 8 May 2021 09:25:44 +0000 Subject: [PATCH 175/263] powerpc/signal: Fix possible build failure with unsafe_copy_fpr_{to/from}_user When neither CONFIG_VSX nor CONFIG_PPC_FPU_REGS are selected, unsafe_copy_fpr_to_user() and unsafe_copy_fpr_from_user() are doing nothing. Then, unless the 'label' operand is used elsewhere, GCC complains about it being defined but not used. To fix that, add an impossible 'goto label'. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cadc0a328bc8e6c5bf133193e7547d5c10ae7895.1620465920.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/signal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index f4aafa337c2ed..1f07317964e49 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -166,9 +166,9 @@ copy_ckfpr_from_user(struct task_struct *task, void __user *from) } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ #else -#define unsafe_copy_fpr_to_user(to, task, label) do { } while (0) +#define unsafe_copy_fpr_to_user(to, task, label) do { if (0) goto label;} while (0) -#define unsafe_copy_fpr_from_user(task, from, label) do { } while (0) +#define unsafe_copy_fpr_from_user(task, from, label) do { if (0) goto label;} while (0) static inline unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task) From 63970f3c37e75997ed86dbdfdc83df35f2152bb1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 8 May 2021 06:36:21 +0000 Subject: [PATCH 176/263] powerpc/legacy_serial: Fix UBSAN: array-index-out-of-bounds UBSAN complains when a pointer is calculated with invalid 'legacy_serial_console' index, allthough the index is verified before dereferencing the pointer. Fix it by checking 'legacy_serial_console' validity before calculating pointers. Fixes: 0bd3f9e953bd ("powerpc/legacy_serial: Use early_ioremap()") Reported-by: Paul Menzel Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210511010712.750096-1-mpe@ellerman.id.au --- arch/powerpc/kernel/legacy_serial.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 8b2c1a8553a0e..cfc03e016ff2d 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -356,13 +356,16 @@ static void __init setup_legacy_serial_console(int console) static int __init ioremap_legacy_serial_console(void) { - struct legacy_serial_info *info = &legacy_serial_infos[legacy_serial_console]; - struct plat_serial8250_port *port = &legacy_serial_ports[legacy_serial_console]; + struct plat_serial8250_port *port; + struct legacy_serial_info *info; void __iomem *vaddr; if (legacy_serial_console < 0) return 0; + info = &legacy_serial_infos[legacy_serial_console]; + port = &legacy_serial_ports[legacy_serial_console]; + if (!info->early_addr) return 0; From da3bb206c9ceb0736d9e2897ea697acabad35833 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 11 May 2021 20:54:59 +1000 Subject: [PATCH 177/263] KVM: PPC: Book3S HV: Fix kvm_unmap_gfn_range_hv() for Hash MMU Commit 32b48bf8514c ("KVM: PPC: Book3S HV: Fix conversion to gfn-based MMU notifier callbacks") fixed kvm_unmap_gfn_range_hv() by adding a for loop over each gfn in the range. But for the Hash MMU it repeatedly calls kvm_unmap_rmapp() with the first gfn of the range, rather than iterating through the range. This exhibits as strange guest behaviour, sometimes crashing in firmare, or booting and then guest userspace crashing unexpectedly. Fix it by passing the iterator, gfn, to kvm_unmap_rmapp(). Fixes: 32b48bf8514c ("KVM: PPC: Book3S HV: Fix conversion to gfn-based MMU notifier callbacks") Reviewed-by: Sean Christopherson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210511105459.800788-1-mpe@ellerman.id.au --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2d9193cd73be4..c63e263312a4f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -840,7 +840,7 @@ bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) kvm_unmap_radix(kvm, range->slot, gfn); } else { for (gfn = range->start; gfn < range->end; gfn++) - kvm_unmap_rmapp(kvm, range->slot, range->start); + kvm_unmap_rmapp(kvm, range->slot, gfn); } return false; From e9f4eee9a0023ba22db9560d4cc6ee63f933dae8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 May 2021 21:38:36 -0400 Subject: [PATCH 178/263] blk-iocost: fix weight updates of inner active iocgs When the weight of an active iocg is updated, weight_updated() is called which in turn calls __propagate_weights() to update the active and inuse weights so that the effective hierarchical weights are update accordingly. The current implementation is incorrect for inner active nodes. For an active leaf iocg, inuse can be any value between 1 and active and the difference represents how much the iocg is donating. When weight is updated, as long as inuse is clamped between 1 and the new weight, we're alright and this is what __propagate_weights() currently implements. However, that's not how an active inner node's inuse is set. An inner node's inuse is solely determined by the ratio between the sums of inuse's and active's of its children - ie. they're results of propagating the leaves' active and inuse weights upwards. __propagate_weights() incorrectly applies the same clamping as for a leaf when an active inner node's weight is updated. Consider a hierarchy which looks like the following with saturating workloads in AA and BB. R / \ A B | | AA BB 1. For both A and B, active=100, inuse=100, hwa=0.5, hwi=0.5. 2. echo 200 > A/io.weight 3. __propagate_weights() update A's active to 200 and leave inuse at 100 as it's already between 1 and the new active, making A:active=200, A:inuse=100. As R's active_sum is updated along with A's active, A:hwa=2/3, B:hwa=1/3. However, because the inuses didn't change, the hwi's remain unchanged at 0.5. 4. The weight of A is now twice that of B but AA and BB still have the same hwi of 0.5 and thus are doing the same amount of IOs. Fix it by making __propgate_weights() always calculate the inuse of an active inner iocg based on the ratio of child_inuse_sum to child_active_sum. Signed-off-by: Tejun Heo Reported-by: Dan Schatzberg Fixes: 7caa47151ab2 ("blkcg: implement blk-iocost") Cc: stable@vger.kernel.org # v5.4+ Link: https://lore.kernel.org/r/YJsxnLZV1MnBcqjj@slm.duckdns.org Signed-off-by: Jens Axboe --- block/blk-iocost.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index e0c4baa018578..c2d6bc88d3f15 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1069,7 +1069,17 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, lockdep_assert_held(&ioc->lock); - inuse = clamp_t(u32, inuse, 1, active); + /* + * For an active leaf node, its inuse shouldn't be zero or exceed + * @active. An active internal node's inuse is solely determined by the + * inuse to active ratio of its children regardless of @inuse. + */ + if (list_empty(&iocg->active_list) && iocg->child_active_sum) { + inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, + iocg->child_active_sum); + } else { + inuse = clamp_t(u32, inuse, 1, active); + } iocg->last_inuse = iocg->inuse; if (save) @@ -1086,7 +1096,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, /* update the level sums */ parent->child_active_sum += (s32)(active - child->active); parent->child_inuse_sum += (s32)(inuse - child->inuse); - /* apply the udpates */ + /* apply the updates */ child->active = active; child->inuse = inuse; From ca298241bc229303ff683db7265a2c625a9c00fe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 May 2021 14:38:47 -0700 Subject: [PATCH 179/263] f2fs: avoid swapon failure by giving a warning first The final solution can be migrating blocks to form a section-aligned file internally. Meanwhile, let's ask users to do that when preparing the swap file initially like: 1) create() 2) ioctl(F2FS_IOC_SET_PIN_FILE) 3) fallocate() Reported-by: kernel test robot Fixes: 36e4d95891ed ("f2fs: check if swapfile is section-alligned") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 33e56ae84e358..41e260680b27c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3801,6 +3801,7 @@ static int f2fs_is_file_aligned(struct inode *inode) block_t pblock; unsigned long nr_pblocks; unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; int ret = 0; cur_lblock = 0; @@ -3833,13 +3834,20 @@ static int f2fs_is_file_aligned(struct inode *inode) if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || nr_pblocks & (blocks_per_sec - 1)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; } cur_lblock += nr_pblocks; } + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } @@ -3858,6 +3866,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, int nr_extents = 0; unsigned long nr_pblocks; unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; int ret = 0; /* @@ -3896,9 +3905,12 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || nr_pblocks & (blocks_per_sec - 1)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; } if (cur_lblock + nr_pblocks >= sis->max) @@ -3927,6 +3939,11 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; + + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } From 02dbb7246c5bbbbe1607ebdc546ba5c454a664b1 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 11 May 2021 20:46:09 +0530 Subject: [PATCH 180/263] sched/fair: Fix clearing of has_idle_cores flag in select_idle_cpu() In commit: 9fe1f127b913 ("sched/fair: Merge select_idle_core/cpu()") in select_idle_cpu(), we check if an idle core is present in the LLC of the target CPU via the flag "has_idle_cores". We look for the idle core in select_idle_cores(). If select_idle_cores() isn't able to find an idle core/CPU, we need to unset the has_idle_cores flag in the LLC of the target to prevent other CPUs from going down this route. However, the current code is unsetting it in the LLC of the current CPU instead of the target CPU. This patch fixes this issue. Fixes: 9fe1f127b913 ("sched/fair: Merge select_idle_core/cpu()") Signed-off-by: Gautham R. Shenoy Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Link: https://lore.kernel.org/r/1620746169-13996-1-git-send-email-ego@linux.vnet.ibm.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20aa234ffe04c..3248e24a90b0f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6217,7 +6217,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } if (has_idle_core) - set_idle_cores(this, false); + set_idle_cores(target, false); if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; From 7ea96eefb0097d243af62fc672be9f17b10338b3 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 12 May 2021 11:43:52 +0200 Subject: [PATCH 181/263] block, bfq: avoid circular stable merges BFQ may merge a new bfq_queue, stably, with the last bfq_queue created. In particular, BFQ first waits a little bit for some I/O to flow inside the new queue, say Q2, if this is needed to understand whether it is better or worse to merge Q2 with the last queue created, say Q1. This delayed stable merge is performed by assigning bic->stable_merge_bfqq = Q1, for the bic associated with Q1. Yet, while waiting for some I/O to flow in Q2, a non-stable queue merge of Q2 with Q1 may happen, causing the bic previously associated with Q2 to be associated with exactly Q1 (bic->bfqq = Q1). After that, Q2 and Q1 may happen to be split, and, in the split, Q1 may happen to be recycled as a non-shared bfq_queue. In that case, Q1 may then happen to undergo a stable merge with the bfq_queue pointed by bic->stable_merge_bfqq. Yet bic->stable_merge_bfqq still points to Q1. So Q1 would be merged with itself. This commit fixes this error by intercepting this situation, and canceling the schedule of the stable merge. Fixes: 430a67f9d616 ("block, bfq: merge bursts of newly-created queues") Signed-off-by: Pietro Pedroni Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210512094352.85545-2-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 59b2499d3f8be..acd1f881273e0 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -372,9 +372,38 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) return bic->bfqq[is_sync]; } +static void bfq_put_stable_ref(struct bfq_queue *bfqq); + void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) { + /* + * If bfqq != NULL, then a non-stable queue merge between + * bic->bfqq and bfqq is happening here. This causes troubles + * in the following case: bic->bfqq has also been scheduled + * for a possible stable merge with bic->stable_merge_bfqq, + * and bic->stable_merge_bfqq == bfqq happens to + * hold. Troubles occur because bfqq may then undergo a split, + * thereby becoming eligible for a stable merge. Yet, if + * bic->stable_merge_bfqq points exactly to bfqq, then bfqq + * would be stably merged with itself. To avoid this anomaly, + * we cancel the stable merge if + * bic->stable_merge_bfqq == bfqq. + */ bic->bfqq[is_sync] = bfqq; + + if (bfqq && bic->stable_merge_bfqq == bfqq) { + /* + * Actually, these same instructions are executed also + * in bfq_setup_cooperator, in case of abort or actual + * execution of a stable merge. We could avoid + * repeating these instructions there too, but if we + * did so, we would nest even more complexity in this + * function. + */ + bfq_put_stable_ref(bic->stable_merge_bfqq); + + bic->stable_merge_bfqq = NULL; + } } struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) @@ -2630,8 +2659,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq); -static void bfq_put_stable_ref(struct bfq_queue *bfqq); - /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return From 190515f610946db025cdedebde93958b725fb583 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Wed, 12 May 2021 18:01:24 +0800 Subject: [PATCH 182/263] blkdev.h: remove unused codes blk_account_rq Last users of blk_account_rq gone with patch commit a1ce35fa49852db ("block: remove dead elevator code") and now it gets no caller, it can be safely removed. Signed-off-by: Lin Feng Link: https://lore.kernel.org/r/20210512100124.173769-1-linf@wangsu.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b91ba6207365b..26c3e368656f0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -677,11 +677,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -static inline bool blk_account_rq(struct request *rq) -{ - return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq); -} - #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) From 2404b8747019184002823dba7d2f0ecf89d802b7 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 11 May 2021 23:31:42 +0530 Subject: [PATCH 183/263] ACPI: PM: Add ACPI ID of Alder Lake Fan Add a new unique fan ACPI device ID for Alder Lake to support it in acpi_dev_pm_attach() function. Fixes: 38748bcb940e ("ACPI: DPTF: Support Alder Lake") Signed-off-by: Sumeet Pawnikar Acked-by: Zhang Rui Cc: 5.10+ # 5.10+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/device_pm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 096153761ebc3..58876248b1921 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1310,6 +1310,7 @@ int acpi_dev_pm_attach(struct device *dev, bool power_on) {"PNP0C0B", }, /* Generic ACPI fan */ {"INT3404", }, /* Fan */ {"INTC1044", }, /* Fan for Tiger Lake generation */ + {"INTC1048", }, /* Fan for Alder Lake generation */ {} }; struct acpi_device *adev = ACPI_COMPANION(dev); From f395183f9544ba2f56b25938d6ea7042bd873521 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 May 2021 07:38:00 -0700 Subject: [PATCH 184/263] f2fs: return EINVAL for hole cases in swap file This tries to fix xfstests/generic/495. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 41e260680b27c..009a09fb9d88c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3896,7 +3896,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { f2fs_err(sbi, "Swapfile has holes\n"); - ret = -ENOENT; + ret = -EINVAL; goto out; } @@ -4052,7 +4052,7 @@ static int check_swap_activate(struct swap_info_struct *sis, return ret; bad_bmap: f2fs_err(sbi, "Swapfile has holes\n"); - return -ENOENT; + return -EINVAL; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, From 79ebe9110fa458d58f1fceb078e2068d7ad37390 Mon Sep 17 00:00:00 2001 From: Sun Ke Date: Wed, 12 May 2021 19:43:30 +0800 Subject: [PATCH 185/263] nbd: Fix NULL pointer in flush_workqueue Open /dev/nbdX first, the config_refs will be 1 and the pointers in nbd_device are still null. Disconnect /dev/nbdX, then reference a null recv_workq. The protection by config_refs in nbd_genl_disconnect is useless. [ 656.366194] BUG: kernel NULL pointer dereference, address: 0000000000000020 [ 656.368943] #PF: supervisor write access in kernel mode [ 656.369844] #PF: error_code(0x0002) - not-present page [ 656.370717] PGD 10cc87067 P4D 10cc87067 PUD 1074b4067 PMD 0 [ 656.371693] Oops: 0002 [#1] SMP [ 656.372242] CPU: 5 PID: 7977 Comm: nbd-client Not tainted 5.11.0-rc5-00040-g76c057c84d28 #1 [ 656.373661] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 [ 656.375904] RIP: 0010:mutex_lock+0x29/0x60 [ 656.376627] Code: 00 0f 1f 44 00 00 55 48 89 fd 48 83 05 6f d7 fe 08 01 e8 7a c3 ff ff 48 83 05 6a d7 fe 08 01 31 c0 65 48 8b 14 25 00 6d 01 00 48 0f b1 55 d [ 656.378934] RSP: 0018:ffffc900005eb9b0 EFLAGS: 00010246 [ 656.379350] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 656.379915] RDX: ffff888104cf2600 RSI: ffffffffaae8f452 RDI: 0000000000000020 [ 656.380473] RBP: 0000000000000020 R08: 0000000000000000 R09: ffff88813bd6b318 [ 656.381039] R10: 00000000000000c7 R11: fefefefefefefeff R12: ffff888102710b40 [ 656.381599] R13: ffffc900005eb9e0 R14: ffffffffb2930680 R15: ffff88810770ef00 [ 656.382166] FS: 00007fdf117ebb40(0000) GS:ffff88813bd40000(0000) knlGS:0000000000000000 [ 656.382806] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 656.383261] CR2: 0000000000000020 CR3: 0000000100c84000 CR4: 00000000000006e0 [ 656.383819] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 656.384370] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 656.384927] Call Trace: [ 656.385111] flush_workqueue+0x92/0x6c0 [ 656.385395] nbd_disconnect_and_put+0x81/0xd0 [ 656.385716] nbd_genl_disconnect+0x125/0x2a0 [ 656.386034] genl_family_rcv_msg_doit.isra.0+0x102/0x1b0 [ 656.386422] genl_rcv_msg+0xfc/0x2b0 [ 656.386685] ? nbd_ioctl+0x490/0x490 [ 656.386954] ? genl_family_rcv_msg_doit.isra.0+0x1b0/0x1b0 [ 656.387354] netlink_rcv_skb+0x62/0x180 [ 656.387638] genl_rcv+0x34/0x60 [ 656.387874] netlink_unicast+0x26d/0x590 [ 656.388162] netlink_sendmsg+0x398/0x6c0 [ 656.388451] ? netlink_rcv_skb+0x180/0x180 [ 656.388750] ____sys_sendmsg+0x1da/0x320 [ 656.389038] ? ____sys_recvmsg+0x130/0x220 [ 656.389334] ___sys_sendmsg+0x8e/0xf0 [ 656.389605] ? ___sys_recvmsg+0xa2/0xf0 [ 656.389889] ? handle_mm_fault+0x1671/0x21d0 [ 656.390201] __sys_sendmsg+0x6d/0xe0 [ 656.390464] __x64_sys_sendmsg+0x23/0x30 [ 656.390751] do_syscall_64+0x45/0x70 [ 656.391017] entry_SYSCALL_64_after_hwframe+0x44/0xa9 To fix it, just add if (nbd->recv_workq) to nbd_disconnect_and_put(). Fixes: e9e006f5fcf2 ("nbd: fix max number of supported devs") Signed-off-by: Sun Ke Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210512114331.1233964-2-sunke32@huawei.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4ff71b579cfcc..974da561b8e5e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1980,7 +1980,8 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) * config ref and try to destroy the workqueue from inside the work * queue. */ - flush_workqueue(nbd->recv_workq); + if (nbd->recv_workq) + flush_workqueue(nbd->recv_workq); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); From bedf78c4cbbbb65e42ede5ca2bd21887ef5b7060 Mon Sep 17 00:00:00 2001 From: Sun Ke Date: Wed, 12 May 2021 19:43:31 +0800 Subject: [PATCH 186/263] nbd: share nbd_put and return by goto put_nbd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the following two statements by the statement “goto put_nbd;” nbd_put(nbd); return 0; Signed-off-by: Sun Ke Suggested-by: Markus Elfring Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210512114331.1233964-3-sunke32@huawei.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 974da561b8e5e..45d2c28c8fc83 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2015,12 +2015,11 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } mutex_unlock(&nbd_index_mutex); - if (!refcount_inc_not_zero(&nbd->config_refs)) { - nbd_put(nbd); - return 0; - } + if (!refcount_inc_not_zero(&nbd->config_refs)) + goto put_nbd; nbd_disconnect_and_put(nbd); nbd_config_put(nbd); +put_nbd: nbd_put(nbd); return 0; } From dbb5afad100a828c97e012c6106566d99f041db6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 12 May 2021 15:33:08 +0200 Subject: [PATCH 187/263] ptrace: make ptrace() fail if the tracee changed its pid unexpectedly Suppose we have 2 threads, the group-leader L and a sub-theread T, both parked in ptrace_stop(). Debugger tries to resume both threads and does ptrace(PTRACE_CONT, T); ptrace(PTRACE_CONT, L); If the sub-thread T execs in between, the 2nd PTRACE_CONT doesn not resume the old leader L, it resumes the post-exec thread T which was actually now stopped in PTHREAD_EVENT_EXEC. In this case the PTHREAD_EVENT_EXEC event is lost, and the tracer can't know that the tracee changed its pid. This patch makes ptrace() fail in this case until debugger does wait() and consumes PTHREAD_EVENT_EXEC which reports old_pid. This affects all ptrace requests except the "asynchronous" PTRACE_INTERRUPT/KILL. The patch doesn't add the new PTRACE_ option to not complicate the API, and I _hope_ this won't cause any noticeable regression: - If debugger uses PTRACE_O_TRACEEXEC and the thread did an exec and the tracer does a ptrace request without having consumed the exec event, it's 100% sure that the thread the ptracer thinks it is targeting does not exist anymore, or isn't the same as the one it thinks it is targeting. - To some degree this patch adds nothing new. In the scenario above ptrace(L) can fail with -ESRCH if it is called after the execing sub-thread wakes the leader up and before it "steals" the leader's pid. Test-case: #include #include #include #include #include #include #include #include void *tf(void *arg) { execve("/usr/bin/true", NULL, NULL); assert(0); return NULL; } int main(void) { int leader = fork(); if (!leader) { kill(getpid(), SIGSTOP); pthread_t th; pthread_create(&th, NULL, tf, NULL); for (;;) pause(); return 0; } waitpid(leader, NULL, WSTOPPED); ptrace(PTRACE_SEIZE, leader, 0, PTRACE_O_TRACECLONE | PTRACE_O_TRACEEXEC); waitpid(leader, NULL, 0); ptrace(PTRACE_CONT, leader, 0,0); waitpid(leader, NULL, 0); int status, thread = waitpid(-1, &status, 0); assert(thread > 0 && thread != leader); assert(status == 0x80137f); ptrace(PTRACE_CONT, thread, 0,0); /* * waitid() because waitpid(leader, &status, WNOWAIT) does not * report status. Why ???? * * Why WEXITED? because we have another kernel problem connected * to mt-exec. */ siginfo_t info; assert(waitid(P_PID, leader, &info, WSTOPPED|WEXITED|WNOWAIT) == 0); assert(info.si_pid == leader && info.si_status == 0x0405); /* OK, it sleeps in ptrace(PTRACE_EVENT_EXEC == 0x04) */ assert(ptrace(PTRACE_CONT, leader, 0,0) == -1); assert(errno == ESRCH); assert(leader == waitpid(leader, &status, WNOHANG)); assert(status == 0x04057f); assert(ptrace(PTRACE_CONT, leader, 0,0) == 0); return 0; } Signed-off-by: Oleg Nesterov Reported-by: Simon Marchi Acked-by: "Eric W. Biederman" Acked-by: Pedro Alves Acked-by: Simon Marchi Acked-by: Jan Kratochvil Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 76f09456ec4bc..2997ca600d186 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -170,6 +170,21 @@ void __ptrace_unlink(struct task_struct *child) spin_unlock(&child->sighand->siglock); } +static bool looks_like_a_spurious_pid(struct task_struct *task) +{ + if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP)) + return false; + + if (task_pid_vnr(task) == task->ptrace_message) + return false; + /* + * The tracee changed its pid but the PTRACE_EVENT_EXEC event + * was not wait()'ed, most probably debugger targets the old + * leader which was destroyed in de_thread(). + */ + return true; +} + /* Ensure that nothing can wake it up, even SIGKILL */ static bool ptrace_freeze_traced(struct task_struct *task) { @@ -180,7 +195,8 @@ static bool ptrace_freeze_traced(struct task_struct *task) return ret; spin_lock_irq(&task->sighand->siglock); - if (task_is_traced(task) && !__fatal_signal_pending(task)) { + if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && + !__fatal_signal_pending(task)) { task->state = __TASK_TRACED; ret = true; } From 85428beac80dbcace5b146b218697c73e367dcf5 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 May 2021 16:50:05 +0200 Subject: [PATCH 188/263] nvmet: seset ns->file when open fails Reset the ns->file value to NULL also in the error case in nvmet_file_ns_enable(). The ns->file variable points either to file object or contains the error code after the filp_open() call. This can lead to following problem: When the user first setups an invalid file backend and tries to enable the ns, it will fail. Then the user switches over to a bdev backend and enables successfully the ns. The first received I/O will crash the system because the IO backend is chosen based on the ns->file value: static u16 nvmet_parse_io_cmd(struct nvmet_req *req) { [...] if (req->ns->file) return nvmet_file_parse_io_cmd(req); return nvmet_bdev_parse_io_cmd(req); } Reported-by: Enzo Matsumiya Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-file.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 715d4376c9979..7fdbdc496597d 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -49,9 +49,11 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) ns->file = filp_open(ns->device_path, flags, 0); if (IS_ERR(ns->file)) { - pr_err("failed to open file %s: (%ld)\n", - ns->device_path, PTR_ERR(ns->file)); - return PTR_ERR(ns->file); + ret = PTR_ERR(ns->file); + pr_err("failed to open file %s: (%d)\n", + ns->device_path, ret); + ns->file = NULL; + return ret; } ret = nvmet_file_ns_revalidate(ns); From 4819d16d91145966ce03818a95169df1fd56b299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 21 Apr 2021 18:33:58 +0300 Subject: [PATCH 189/263] drm/i915: Avoid div-by-zero on gen2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gen2 tiles are 2KiB in size so i915_gem_object_get_tile_row_size() can in fact return <4KiB, which leads to div-by-zero here. Avoid that. Not sure i915_gem_object_get_tile_row_size() is entirely sane anyway since it doesn't account for the different tile layouts on i8xx/i915... I'm not able to hit this before commit 6846895fde05 ("drm/i915: Replace PIN_NONFAULT with calls to PIN_NOEVICT") and it looks like I also need to run recent version of Mesa. With those in place xonotic trips on this quite easily on my 85x. Cc: stable@vger.kernel.org Reviewed-by: Chris Wilson Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20210421153401.13847-2-ville.syrjala@linux.intel.com (cherry picked from commit ed52c62d386f764194e0184fdb905d5f24194cae) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 23f6b00e08e21..f6fe5cb014382 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -189,7 +189,7 @@ compute_partial_view(const struct drm_i915_gem_object *obj, struct i915_ggtt_view view; if (i915_gem_object_is_tiled(obj)) - chunk = roundup(chunk, tile_row_pages(obj)); + chunk = roundup(chunk, tile_row_pages(obj) ?: 1); view.type = I915_GGTT_VIEW_PARTIAL; view.partial.offset = rounddown(page_offset, chunk); From 04d019961fd15de92874575536310243a0d4c5c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 21 Apr 2021 18:33:59 +0300 Subject: [PATCH 190/263] drm/i915: Read C0DRB3/C1DRB3 as 16 bits again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've defined C0DRB3/C1DRB3 as 16 bit registers, so access them as such. Fixes: 1c8242c3a4b2 ("drm/i915: Use unchecked writes for setting up the fences") Reviewed-by: Chris Wilson Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20210421153401.13847-3-ville.syrjala@linux.intel.com (cherry picked from commit f765a5b48c667bdada5e49d5e0f23f8c0687b21b) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c index e72b7a0dc316e..8a322594210c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c @@ -653,8 +653,8 @@ static void detect_bit_6_swizzle(struct i915_ggtt *ggtt) * banks of memory are paired and unswizzled on the * uneven portion, so leave that as unknown. */ - if (intel_uncore_read(uncore, C0DRB3) == - intel_uncore_read(uncore, C1DRB3)) { + if (intel_uncore_read16(uncore, C0DRB3) == + intel_uncore_read16(uncore, C1DRB3)) { swizzle_x = I915_BIT_6_SWIZZLE_9_10; swizzle_y = I915_BIT_6_SWIZZLE_9; } From ea995218dddba171fecd05496c69617c5ef3c5b8 Mon Sep 17 00:00:00 2001 From: Lv Yunlong Date: Mon, 26 Apr 2021 05:43:40 -0700 Subject: [PATCH 191/263] drm/i915/gt: Fix a double free in gen8_preallocate_top_level_pdp Our code analyzer reported a double free bug. In gen8_preallocate_top_level_pdp, pde and pde->pt.base are allocated via alloc_pd(vm) with one reference. If pin_pt_dma() failed, pde->pt.base is freed by i915_gem_object_put() with a reference dropped. Then free_pd calls free_px() defined in intel_ppgtt.c, which calls i915_gem_object_put() to put pde->pt.base again. As pde->pt.base is protected by refcount, so the second put will not free pde->pt.base actually. But, maybe it is better to remove the first put? Fixes: 82adf901138cc ("drm/i915/gt: Shrink i915_page_directory's slab bucket") Signed-off-by: Lv Yunlong Reviewed-by: Matthew Auld Signed-off-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20210426124340.4238-1-lyl2019@mail.ustc.edu.cn (cherry picked from commit ac69496fe65cca0611d5917b7d232730ff605bc7) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index 176c19633412f..74bf6fc8461fe 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -641,7 +641,6 @@ static int gen8_preallocate_top_level_pdp(struct i915_ppgtt *ppgtt) err = pin_pt_dma(vm, pde->pt.base); if (err) { - i915_gem_object_put(pde->pt.base); free_pd(vm, pde); return err; } From 402be8a101190969fc7ff122d07e262df86e132b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Marchesin?= Date: Thu, 29 Apr 2021 03:10:21 +0000 Subject: [PATCH 192/263] drm/i915: Fix crash in auto_retire MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retire logic uses the 2 lower bits of the pointer to the retire function to store flags. However, the auto_retire function is not guaranteed to be aligned to a multiple of 4, which causes crashes as we jump to the wrong address, for example like this: 2021-04-24T18:03:53.804300Z WARNING kernel: [ 516.876901] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI 2021-04-24T18:03:53.804310Z WARNING kernel: [ 516.876906] CPU: 7 PID: 146 Comm: kworker/u16:6 Tainted: G U 5.4.105-13595-g3cd84167b2df #1 2021-04-24T18:03:53.804311Z WARNING kernel: [ 516.876907] Hardware name: Google Volteer2/Volteer2, BIOS Google_Volteer2.13672.76.0 02/22/2021 2021-04-24T18:03:53.804312Z WARNING kernel: [ 516.876911] Workqueue: events_unbound active_work 2021-04-24T18:03:53.804313Z WARNING kernel: [ 516.876914] RIP: 0010:auto_retire+0x1/0x20 2021-04-24T18:03:53.804314Z WARNING kernel: [ 516.876916] Code: e8 01 f2 ff ff eb 02 31 db 48 89 d8 5b 5d c3 0f 1f 44 00 00 55 48 89 e5 f0 ff 87 c8 00 00 00 0f 88 ab 47 4a 00 31 c0 5d c3 0f <1f> 44 00 00 55 48 89 e5 f0 ff 8f c8 00 00 00 0f 88 9a 47 4a 00 74 2021-04-24T18:03:53.804319Z WARNING kernel: [ 516.876918] RSP: 0018:ffff9b4d809fbe38 EFLAGS: 00010286 2021-04-24T18:03:53.804320Z WARNING kernel: [ 516.876919] RAX: 0000000000000007 RBX: ffff927915079600 RCX: 0000000000000007 2021-04-24T18:03:53.804320Z WARNING kernel: [ 516.876921] RDX: ffff9b4d809fbe40 RSI: 0000000000000286 RDI: ffff927915079600 2021-04-24T18:03:53.804321Z WARNING kernel: [ 516.876922] RBP: ffff9b4d809fbe68 R08: 8080808080808080 R09: fefefefefefefeff 2021-04-24T18:03:53.804321Z WARNING kernel: [ 516.876924] R10: 0000000000000010 R11: ffffffff92e44bd8 R12: ffff9279150796a0 2021-04-24T18:03:53.804322Z WARNING kernel: [ 516.876925] R13: ffff92791c368180 R14: ffff927915079640 R15: 000000001c867605 2021-04-24T18:03:53.804323Z WARNING kernel: [ 516.876926] FS: 0000000000000000(0000) GS:ffff92791ffc0000(0000) knlGS:0000000000000000 2021-04-24T18:03:53.804323Z WARNING kernel: [ 516.876928] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 2021-04-24T18:03:53.804324Z WARNING kernel: [ 516.876929] CR2: 0000239514955000 CR3: 00000007f82da001 CR4: 0000000000760ee0 2021-04-24T18:03:53.804325Z WARNING kernel: [ 516.876930] PKRU: 55555554 2021-04-24T18:03:53.804325Z WARNING kernel: [ 516.876931] Call Trace: 2021-04-24T18:03:53.804326Z WARNING kernel: [ 516.876935] __active_retire+0x77/0xcf 2021-04-24T18:03:53.804326Z WARNING kernel: [ 516.876939] process_one_work+0x1da/0x394 2021-04-24T18:03:53.804327Z WARNING kernel: [ 516.876941] worker_thread+0x216/0x375 2021-04-24T18:03:53.804327Z WARNING kernel: [ 516.876944] kthread+0x147/0x156 2021-04-24T18:03:53.804335Z WARNING kernel: [ 516.876946] ? pr_cont_work+0x58/0x58 2021-04-24T18:03:53.804335Z WARNING kernel: [ 516.876948] ? kthread_blkcg+0x2e/0x2e 2021-04-24T18:03:53.804336Z WARNING kernel: [ 516.876950] ret_from_fork+0x1f/0x40 2021-04-24T18:03:53.804336Z WARNING kernel: [ 516.876952] Modules linked in: cdc_mbim cdc_ncm cdc_wdm xt_cgroup rfcomm cmac algif_hash algif_skcipher af_alg xt_MASQUERADE uinput snd_soc_rt5682_sdw snd_soc_rt5682 snd_soc_max98373_sdw snd_soc_max98373 snd_soc_rl6231 regmap_sdw snd_soc_sof_sdw snd_soc_hdac_hdmi snd_soc_dmic snd_hda_codec_hdmi snd_sof_pci snd_sof_intel_hda_common intel_ipu6_psys snd_sof_xtensa_dsp soundwire_intel soundwire_generic_allocation soundwire_cadence snd_sof_intel_hda snd_sof snd_soc_hdac_hda snd_soc_acpi_intel_match snd_soc_acpi snd_hda_ext_core soundwire_bus snd_hda_intel snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core intel_ipu6_isys videobuf2_dma_contig videobuf2_v4l2 videobuf2_common videobuf2_memops mei_hdcp intel_ipu6 ov2740 ov8856 at24 sx9310 dw9768 v4l2_fwnode cros_ec_typec intel_pmc_mux roles acpi_als typec fuse iio_trig_sysfs cros_ec_light_prox cros_ec_lid_angle cros_ec_sensors cros_ec_sensors_core industrialio_triggered_buffer cros_ec_sensors_ring kfifo_buf industrialio cros_ec_sensorhub 2021-04-24T18:03:53.804337Z WARNING kernel: [ 516.876972] cdc_ether usbnet iwlmvm lzo_rle lzo_compress iwl7000_mac80211 iwlwifi zram cfg80211 r8152 mii btusb btrtl btintel btbcm bluetooth ecdh_generic ecc joydev 2021-04-24T18:03:53.804337Z EMERG kernel: [ 516.879169] gsmi: Log Shutdown Reason 0x03 This change fixes this by aligning the function. Signed-off-by: Stéphane Marchesin Fixes: 229007e02d69 ("drm/i915: Wrap i915_active in a simple kreffed struct") Signed-off-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20210429031021.1218091-1-marcheu@chromium.org (cherry picked from commit ca419f407b43cc89942ebc297c7a63d94abbcae4) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_active.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index cf9a3d384971f..aa573b078ae75 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -1156,7 +1156,8 @@ static int auto_active(struct i915_active *ref) return 0; } -static void auto_retire(struct i915_active *ref) +__i915_active_call static void +auto_retire(struct i915_active *ref) { i915_active_put(ref); } From a915fe5e9601c632417ef5261af70788d7d23a8a Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 29 Apr 2021 09:35:29 +0100 Subject: [PATCH 193/263] drm/i915/overlay: Fix active retire callback alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __i915_active_call annotation is required on the retire callback to ensure correct function alignment. Signed-off-by: Tvrtko Ursulin Fixes: a21ce8ad12d2 ("drm/i915/overlay: Switch to using i915_active tracking") Cc: Chris Wilson Cc: Matthew Auld Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20210429083530.849546-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit d8e44e4dd221ee283ea60a6fb87bca08807aa0ab) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_overlay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_overlay.c b/drivers/gpu/drm/i915/display/intel_overlay.c index e5dadde422f74..bbaf05515e883 100644 --- a/drivers/gpu/drm/i915/display/intel_overlay.c +++ b/drivers/gpu/drm/i915/display/intel_overlay.c @@ -383,7 +383,7 @@ static void intel_overlay_off_tail(struct intel_overlay *overlay) i830_overlay_clock_gating(dev_priv, true); } -static void +__i915_active_call static void intel_overlay_last_flip_retire(struct i915_active *active) { struct intel_overlay *overlay = From e4527420ed087f99c6aa2ac22c6d3458c7dc1a94 Mon Sep 17 00:00:00 2001 From: Ankit Nautiyal Date: Tue, 11 May 2021 17:39:30 +0530 Subject: [PATCH 194/263] drm/i915: Use correct downstream caps for check Src-Ctl mode for PCON Fix the typo in DPCD caps used for checking SRC CTL mode of HDMI2.1 PCON v2: Corrected Fixes tag (Jani Nikula). v3: Rebased. Fixes: 04b6603d13be ("drm/i915/display: Configure HDMI2.1 Pcon for FRL only if Src-Ctl mode is available") Cc: Ankit Nautiyal Cc: Uma Shankar Cc: Jani Nikula Cc: "Ville Syrj_l_" Cc: Imre Deak Cc: Manasi Navare Cc: Gwan-gyeong Mun Cc: Lucas De Marchi Cc: Sean Paul Signed-off-by: Ankit Nautiyal Reviewed-by: Swati Sharma Signed-off-by: Anshuman Gupta Link: https://patchwork.freedesktop.org/patch/msgid/20210511120930.12218-1-ankit.k.nautiyal@intel.com (cherry picked from commit 88a9c5485c48ab60c89612a17fc89f4162bbdb9d) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 1e026177ed1ba..642c60f3d9b18 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -2111,7 +2111,7 @@ void intel_dp_check_frl_training(struct intel_dp *intel_dp) * -PCON supports SRC_CTL_MODE (VESA DP2.0-HDMI2.1 PCON Spec Draft-1 Sec-7) * -sink is HDMI2.1 */ - if (!(intel_dp->dpcd[2] & DP_PCON_SOURCE_CTL_MODE) || + if (!(intel_dp->downstream_ports[2] & DP_PCON_SOURCE_CTL_MODE) || !intel_dp_is_hdmi_2_1_sink(intel_dp) || intel_dp->frl.is_trained) return; From 46c7405df7de8deb97229eacebcee96d61415f3f Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 12 May 2021 19:42:10 +0200 Subject: [PATCH 195/263] objtool: Fix elf_create_undef_symbol() endianness Currently x86 cross-compilation fails on big endian system with: x86_64-cross-ld: init/main.o: invalid string offset 488112128 >= 6229 for section `.strtab' Mark new ELF data in elf_create_undef_symbol() as symbol, so that libelf does endianness handling correctly. Fixes: 2f2f7e47f052 ("objtool: Add elf_create_undef_symbol()") Signed-off-by: Vasily Gorbik Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/patch-1.thread-6c9df9.git-d39264656387.your-ad-here.call-01620841104-ext-2554@work.hours --- tools/objtool/elf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index d08f5f3670f88..743c2e9d0f564 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -762,6 +762,7 @@ struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) data->d_buf = &sym->sym; data->d_size = sizeof(sym->sym); data->d_align = 1; + data->d_type = ELF_T_SYM; sym->idx = symtab->len / sizeof(sym->sym); From f66c05d6baf36069c01a02f869bebb75586f2318 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 12 May 2021 19:42:13 +0200 Subject: [PATCH 196/263] objtool/x86: Fix elf_add_alternative() endianness Currently x86 kernel cross-compiled on big endian system fails at boot with: kernel BUG at arch/x86/kernel/alternative.c:258! Corresponding bug condition look like the following: BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); Fix that by converting alternative feature/cpuid to target endianness. Fixes: 9bc0bb50727c ("objtool/x86: Rewrite retpoline thunk calls") Signed-off-by: Vasily Gorbik Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/patch-2.thread-6c9df9.git-6c9df9a8098d.your-ad-here.call-01620841104-ext-2554@work.hours --- tools/objtool/arch/x86/decode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index cedf3ede75455..24295d39713b2 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -19,6 +19,7 @@ #include #include #include +#include #include static int is_x86_64(const struct elf *elf) @@ -725,7 +726,7 @@ static int elf_add_alternative(struct elf *elf, return -1; } - alt->cpuid = cpuid; + alt->cpuid = bswap_if_needed(cpuid); alt->instrlen = orig_len; alt->replacementlen = repl_len; From 83a775d5f9bfda95b1c295f95a3a041a40c7f321 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 30 Apr 2021 12:37:24 +0100 Subject: [PATCH 197/263] KEYS: trusted: Fix memory leak on object td Two error return paths are neglecting to free allocated object td, causing a memory leak. Fix this by returning via the error return path that securely kfree's td. Fixes clang scan-build warning: security/keys/trusted-keys/trusted_tpm1.c:496:10: warning: Potential memory leak [unix.Malloc] Cc: stable@vger.kernel.org Fixes: 5df16caada3f ("KEYS: trusted: Fix incorrect handling of tpm_get_random()") Signed-off-by: Colin Ian King Reviewed-by: Nick Desaulniers Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- security/keys/trusted-keys/trusted_tpm1.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c index 4693945508019..aa108bea6739b 100644 --- a/security/keys/trusted-keys/trusted_tpm1.c +++ b/security/keys/trusted-keys/trusted_tpm1.c @@ -493,10 +493,12 @@ static int tpm_seal(struct tpm_buf *tb, uint16_t keytype, ret = tpm_get_random(chip, td->nonceodd, TPM_NONCE_SIZE); if (ret < 0) - return ret; + goto out; - if (ret != TPM_NONCE_SIZE) - return -EIO; + if (ret != TPM_NONCE_SIZE) { + ret = -EIO; + goto out; + } ordinal = htonl(TPM_ORD_SEAL); datsize = htonl(datalen); From b3ad7855b7ae3bed4242894d07bdb7f186652dbe Mon Sep 17 00:00:00 2001 From: Ben Boeckel Date: Thu, 29 Apr 2021 15:21:56 -0400 Subject: [PATCH 198/263] trusted-keys: match tpm_get_ops on all return paths The `tpm_get_ops` call at the beginning of the function is not paired with a `tpm_put_ops` on this return path. Cc: stable@vger.kernel.org Fixes: f2219745250f ("security: keys: trusted: use ASN.1 TPM2 key format for the blobs") Reported-by: Dan Carpenter Signed-off-by: Ben Boeckel Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- security/keys/trusted-keys/trusted_tpm2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c index 617fabd4d913b..0165da386289c 100644 --- a/security/keys/trusted-keys/trusted_tpm2.c +++ b/security/keys/trusted-keys/trusted_tpm2.c @@ -336,9 +336,9 @@ int tpm2_seal_trusted(struct tpm_chip *chip, rc = -EPERM; } if (blob_len < 0) - return blob_len; - - payload->blob_len = blob_len; + rc = blob_len; + else + payload->blob_len = blob_len; tpm_put_ops(chip); return rc; From e630af7dfb450d1c00c30077314acf33032ff9e4 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 10 May 2021 15:28:30 +0300 Subject: [PATCH 199/263] tpm, tpm_tis: Extend locality handling to TPM2 in tpm_tis_gen_interrupt() The earlier fix (linked) only partially fixed the locality handling bug in tpm_tis_gen_interrupt(), i.e. only for TPM 1.x. Extend the locality handling to cover TPM2. Cc: Hans de Goede Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-integrity/20210220125534.20707-1-jarkko@kernel.org/ Fixes: a3fbfae82b4c ("tpm: take TPM chip power gating out of tpm_transmit()") Reported-by: Lino Sanfilippo Signed-off-by: Jarkko Sakkinen Tested-by: Lino Sanfilippo --- drivers/char/tpm/tpm_tis_core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index a2e0395cbe618..6fa150a3b75e0 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -709,16 +709,14 @@ static int tpm_tis_gen_interrupt(struct tpm_chip *chip) cap_t cap; int ret; - /* TPM 2.0 */ - if (chip->flags & TPM_CHIP_FLAG_TPM2) - return tpm2_get_tpm_pt(chip, 0x100, &cap2, desc); - - /* TPM 1.2 */ ret = request_locality(chip, 0); if (ret < 0) return ret; - ret = tpm1_getcap(chip, TPM_CAP_PROP_TIS_TIMEOUT, &cap, desc, 0); + if (chip->flags & TPM_CHIP_FLAG_TPM2) + ret = tpm2_get_tpm_pt(chip, 0x100, &cap2, desc); + else + ret = tpm1_getcap(chip, TPM_CAP_PROP_TIS_TIMEOUT, &cap, desc, 0); release_locality(chip, 0); From 8a2d296aaebadd68d9c1f6908667df1d1c84c051 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 10 May 2021 15:28:31 +0300 Subject: [PATCH 200/263] tpm, tpm_tis: Reserve locality in tpm_tis_resume() Reserve locality in tpm_tis_resume(), as it could be unsert after waking up from a sleep state. Cc: stable@vger.kernel.org Cc: Lino Sanfilippo Reported-by: Hans de Goede Fixes: a3fbfae82b4c ("tpm: take TPM chip power gating out of tpm_transmit()") Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 6fa150a3b75e0..55b9d3965ae1b 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -1125,12 +1125,20 @@ int tpm_tis_resume(struct device *dev) if (ret) return ret; - /* TPM 1.2 requires self-test on resume. This function actually returns + /* + * TPM 1.2 requires self-test on resume. This function actually returns * an error code but for unknown reason it isn't handled. */ - if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) + if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) { + ret = request_locality(chip, 0); + if (ret < 0) + return ret; + tpm1_do_selftest(chip); + release_locality(chip, 0); + } + return 0; } EXPORT_SYMBOL_GPL(tpm_tis_resume); From 1df83992d977355177810c2b711afc30546c81ce Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 12 May 2021 21:39:26 +0800 Subject: [PATCH 201/263] tpm: fix error return code in tpm2_get_cc_attrs_tbl() If the total number of commands queried through TPM2_CAP_COMMANDS is different from that queried through TPM2_CC_GET_CAPABILITY, it indicates an unknown error. In this case, an appropriate error code -EFAULT should be returned. However, we currently do not explicitly assign this error code to 'rc'. As a result, 0 was incorrectly returned. Cc: stable@vger.kernel.org Fixes: 58472f5cd4f6("tpm: validate TPM 2.0 commands") Reported-by: Hulk Robot Signed-off-by: Zhen Lei Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index eff1f12d981ab..c84d239512197 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -656,6 +656,7 @@ int tpm2_get_cc_attrs_tbl(struct tpm_chip *chip) if (nr_commands != be32_to_cpup((__be32 *)&buf.data[TPM_HEADER_SIZE + 5])) { + rc = -EFAULT; tpm_buf_destroy(&buf); goto out; } From 681865a03d3ec6ac3dda147044ed2a1a0f49f7bf Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 19 Apr 2021 19:27:25 +0800 Subject: [PATCH 202/263] libnvdimm: Remove duplicate struct declaration struct device is declared at 133rd line. The second declaration is unnecessary, remove it. Signed-off-by: Wan Jiabing Link: https://lore.kernel.org/r/20210419112725.42145-1-wanjiabing@vivo.com Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 01f251b6e36c5..89b69e645ac74 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -141,7 +141,6 @@ static inline void __iomem *devm_nvdimm_ioremap(struct device *dev, struct nvdimm_bus; struct module; -struct device; struct nd_blk_region; struct nd_blk_region_desc { int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); From 7ddb4cc2b885c740523e6ea54a1f4434acfa3368 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Tue, 20 Apr 2021 15:47:47 +0800 Subject: [PATCH 203/263] tools/testing/nvdimm: Make symbol '__nfit_test_ioremap' static The sparse tool complains as follows: tools/testing/nvdimm/test/iomap.c:65:14: warning: symbol '__nfit_test_ioremap' was not declared. Should it be static? This symbol is not used outside of iomap.c, so this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Zou Wei Link: https://lore.kernel.org/r/1618904867-25275-1-git-send-email-zou_wei@huawei.com Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index c62d372d426fb..ed563bdd88f39 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -62,7 +62,7 @@ struct nfit_test_resource *get_nfit_res(resource_size_t resource) } EXPORT_SYMBOL(get_nfit_res); -void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, +static void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, void __iomem *(*fallback_fn)(resource_size_t, unsigned long)) { struct nfit_test_resource *nfit_res = get_nfit_res(offset); From 3dd4fe4b4dfa34e7487edfe159ef787ba397cfa9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Apr 2021 00:05:28 -0700 Subject: [PATCH 204/263] MAINTAINERS: Move nvdimm mailing list After seeing some users have subscription management trouble, more spam than other Linux development lists, and considering some of the benefits of kernel.org hosted lists, nvdimm and persistent memory development is moving to nvdimm@lists.linux.dev. The old list will remain up until v5.14-rc1 and shutdown thereafter. Cc: Ira Weiny Cc: Oliver O'Halloran Cc: Matthew Wilcox Cc: Jan Kara Cc: Jonathan Corbet Cc: Dave Jiang Cc: Vishal Verma Link: https://lore.kernel.org/r/161898872871.3406469.4054282559340528393.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/obsolete/sysfs-class-dax | 2 +- Documentation/ABI/removed/sysfs-bus-nfit | 2 +- Documentation/ABI/testing/sysfs-bus-nfit | 40 +++++++++---------- Documentation/ABI/testing/sysfs-bus-papr-pmem | 4 +- Documentation/driver-api/nvdimm/nvdimm.rst | 2 +- MAINTAINERS | 14 +++---- 6 files changed, 32 insertions(+), 32 deletions(-) diff --git a/Documentation/ABI/obsolete/sysfs-class-dax b/Documentation/ABI/obsolete/sysfs-class-dax index 0faf1354cd054..5bcce27458e30 100644 --- a/Documentation/ABI/obsolete/sysfs-class-dax +++ b/Documentation/ABI/obsolete/sysfs-class-dax @@ -1,7 +1,7 @@ What: /sys/class/dax/ Date: May, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: Device DAX is the device-centric analogue of Filesystem DAX (CONFIG_FS_DAX). It allows memory ranges to be allocated and mapped without need of an intervening file diff --git a/Documentation/ABI/removed/sysfs-bus-nfit b/Documentation/ABI/removed/sysfs-bus-nfit index ae8c1ca538287..277437005def7 100644 --- a/Documentation/ABI/removed/sysfs-bus-nfit +++ b/Documentation/ABI/removed/sysfs-bus-nfit @@ -1,7 +1,7 @@ What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size Date: Aug, 2017 KernelVersion: v4.14 (Removed v4.18) -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Size of a write request to a DIMM that will not incur a read-modify-write cycle at the memory controller. diff --git a/Documentation/ABI/testing/sysfs-bus-nfit b/Documentation/ABI/testing/sysfs-bus-nfit index 63ef0b9ecce70..e7282d184a747 100644 --- a/Documentation/ABI/testing/sysfs-bus-nfit +++ b/Documentation/ABI/testing/sysfs-bus-nfit @@ -5,7 +5,7 @@ Interface Table (NFIT)' section in the ACPI specification What: /sys/bus/nd/devices/nmemX/nfit/serial Date: Jun, 2015 KernelVersion: v4.2 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Serial number of the NVDIMM (non-volatile dual in-line memory module), assigned by the module vendor. @@ -14,7 +14,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/handle Date: Apr, 2015 KernelVersion: v4.2 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) The address (given by the _ADR object) of the device on its parent bus of the NVDIMM device containing the NVDIMM region. @@ -23,7 +23,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/device Date: Apr, 2015 KernelVersion: v4.1 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Device id for the NVDIMM, assigned by the module vendor. @@ -31,7 +31,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/rev_id Date: Jun, 2015 KernelVersion: v4.2 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Revision of the NVDIMM, assigned by the module vendor. @@ -39,7 +39,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/phys_id Date: Apr, 2015 KernelVersion: v4.2 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Handle (i.e., instance number) for the SMBIOS (system management BIOS) Memory Device structure describing the NVDIMM @@ -49,7 +49,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/flags Date: Jun, 2015 KernelVersion: v4.2 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) The flags in the NFIT memory device sub-structure indicate the state of the data on the nvdimm relative to its energy @@ -68,7 +68,7 @@ What: /sys/bus/nd/devices/nmemX/nfit/format1 What: /sys/bus/nd/devices/nmemX/nfit/formats Date: Apr, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) The interface codes indicate support for persistent memory mapped directly into system physical address space and / or a @@ -84,7 +84,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/vendor Date: Apr, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Vendor id of the NVDIMM. @@ -92,7 +92,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/dsm_mask Date: May, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) The bitmask indicates the supported device specific control functions relative to the NVDIMM command family supported by the @@ -102,7 +102,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/family Date: Apr, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Displays the NVDIMM family command sets. Values 0, 1, 2 and 3 correspond to NVDIMM_FAMILY_INTEL, @@ -118,7 +118,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/id Date: Apr, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) ACPI specification 6.2 section 5.2.25.9, defines an identifier for an NVDIMM, which refelects the id attribute. @@ -127,7 +127,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/subsystem_vendor Date: Apr, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Sub-system vendor id of the NVDIMM non-volatile memory subsystem controller. @@ -136,7 +136,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/subsystem_rev_id Date: Apr, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Sub-system revision id of the NVDIMM non-volatile memory subsystem controller, assigned by the non-volatile memory subsystem @@ -146,7 +146,7 @@ Description: What: /sys/bus/nd/devices/nmemX/nfit/subsystem_device Date: Apr, 2016 KernelVersion: v4.7 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) Sub-system device id for the NVDIMM non-volatile memory subsystem controller, assigned by the non-volatile memory @@ -156,7 +156,7 @@ Description: What: /sys/bus/nd/devices/ndbusX/nfit/revision Date: Jun, 2015 KernelVersion: v4.2 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) ACPI NFIT table revision number. @@ -164,7 +164,7 @@ Description: What: /sys/bus/nd/devices/ndbusX/nfit/scrub Date: Sep, 2016 KernelVersion: v4.9 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RW) This shows the number of full Address Range Scrubs (ARS) that have been completed since driver load time. Userspace can @@ -177,7 +177,7 @@ Description: What: /sys/bus/nd/devices/ndbusX/nfit/hw_error_scrub Date: Sep, 2016 KernelVersion: v4.9 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RW) Provides a way to toggle the behavior between just adding the address (cache line) where the MCE happened to the poison @@ -196,7 +196,7 @@ Description: What: /sys/bus/nd/devices/ndbusX/nfit/dsm_mask Date: Jun, 2017 KernelVersion: v4.13 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) The bitmask indicates the supported bus specific control functions. See the section named 'NVDIMM Root Device _DSMs' in @@ -205,7 +205,7 @@ Description: What: /sys/bus/nd/devices/ndbusX/nfit/firmware_activate_noidle Date: Apr, 2020 KernelVersion: v5.8 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RW) The Intel platform implementation of firmware activate support exposes an option let the platform force idle devices in @@ -225,7 +225,7 @@ Description: What: /sys/bus/nd/devices/regionX/nfit/range_index Date: Jun, 2015 KernelVersion: v4.2 -Contact: linux-nvdimm@lists.01.org +Contact: nvdimm@lists.linux.dev Description: (RO) A unique number provided by the BIOS to identify an address range. Used by NVDIMM Region Mapping Structure to uniquely refer diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem b/Documentation/ABI/testing/sysfs-bus-papr-pmem index 8316c33862a04..92e2db0e2d3de 100644 --- a/Documentation/ABI/testing/sysfs-bus-papr-pmem +++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem @@ -1,7 +1,7 @@ What: /sys/bus/nd/devices/nmemX/papr/flags Date: Apr, 2020 KernelVersion: v5.8 -Contact: linuxppc-dev , linux-nvdimm@lists.01.org, +Contact: linuxppc-dev , nvdimm@lists.linux.dev, Description: (RO) Report flags indicating various states of a papr-pmem NVDIMM device. Each flag maps to a one or @@ -36,7 +36,7 @@ Description: What: /sys/bus/nd/devices/nmemX/papr/perf_stats Date: May, 2020 KernelVersion: v5.9 -Contact: linuxppc-dev , linux-nvdimm@lists.01.org, +Contact: linuxppc-dev , nvdimm@lists.linux.dev, Description: (RO) Report various performance stats related to papr-scm NVDIMM device. Each stat is reported on a new line with each line diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst index ef6d59e0978e7..1d8302b89bd47 100644 --- a/Documentation/driver-api/nvdimm/nvdimm.rst +++ b/Documentation/driver-api/nvdimm/nvdimm.rst @@ -4,7 +4,7 @@ LIBNVDIMM: Non-Volatile Devices libnvdimm - kernel / libndctl - userspace helper library -linux-nvdimm@lists.01.org +nvdimm@lists.linux.dev Version 13 diff --git a/MAINTAINERS b/MAINTAINERS index bd7aff0c120f2..6b5d489022ea8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5237,7 +5237,7 @@ DEVICE DIRECT ACCESS (DAX) M: Dan Williams M: Vishal Verma M: Dave Jiang -L: linux-nvdimm@lists.01.org +L: nvdimm@lists.linux.dev S: Supported F: drivers/dax/ @@ -7006,7 +7006,7 @@ M: Dan Williams R: Matthew Wilcox R: Jan Kara L: linux-fsdevel@vger.kernel.org -L: linux-nvdimm@lists.01.org +L: nvdimm@lists.linux.dev S: Supported F: fs/dax.c F: include/linux/dax.h @@ -10378,7 +10378,7 @@ LIBNVDIMM BLK: MMIO-APERTURE DRIVER M: Dan Williams M: Vishal Verma M: Dave Jiang -L: linux-nvdimm@lists.01.org +L: nvdimm@lists.linux.dev S: Supported Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ P: Documentation/nvdimm/maintainer-entry-profile.rst @@ -10389,7 +10389,7 @@ LIBNVDIMM BTT: BLOCK TRANSLATION TABLE M: Vishal Verma M: Dan Williams M: Dave Jiang -L: linux-nvdimm@lists.01.org +L: nvdimm@lists.linux.dev S: Supported Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ P: Documentation/nvdimm/maintainer-entry-profile.rst @@ -10399,7 +10399,7 @@ LIBNVDIMM PMEM: PERSISTENT MEMORY DRIVER M: Dan Williams M: Vishal Verma M: Dave Jiang -L: linux-nvdimm@lists.01.org +L: nvdimm@lists.linux.dev S: Supported Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ P: Documentation/nvdimm/maintainer-entry-profile.rst @@ -10407,7 +10407,7 @@ F: drivers/nvdimm/pmem* LIBNVDIMM: DEVICETREE BINDINGS M: Oliver O'Halloran -L: linux-nvdimm@lists.01.org +L: nvdimm@lists.linux.dev S: Supported Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ F: Documentation/devicetree/bindings/pmem/pmem-region.txt @@ -10418,7 +10418,7 @@ M: Dan Williams M: Vishal Verma M: Dave Jiang M: Ira Weiny -L: linux-nvdimm@lists.01.org +L: nvdimm@lists.linux.dev S: Supported Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ P: Documentation/nvdimm/maintainer-entry-profile.rst From e9cfd259c6d386f6235395a13bd4f357a979b2d0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 May 2021 00:33:50 -0700 Subject: [PATCH 205/263] ACPI: NFIT: Fix support for variable 'SPA' structure size ACPI 6.4 introduced the "SpaLocationCookie" to the NFIT "System Physical Address (SPA) Range Structure". The presence of that new field is indicated by the ACPI_NFIT_LOCATION_COOKIE_VALID flag. Pre-ACPI-6.4 firmware implementations omit the flag and maintain the original size of the structure. Update the implementation to check that flag to determine the size rather than the ACPI 6.4 compliant definition of 'struct acpi_nfit_system_address' from the Linux ACPICA definitions. Update the test infrastructure for the new expectations as well, i.e. continue to emulate the ACPI 6.3 definition of that structure. Without this fix the kernel fails to validate 'SPA' structures and this leads to a crash in nfit_get_smbios_id() since that routine assumes that SPAs are valid if it finds valid SMBIOS tables. BUG: unable to handle page fault for address: ffffffffffffffa8 [..] Call Trace: skx_get_nvdimm_info+0x56/0x130 [skx_edac] skx_get_dimm_config+0x1f5/0x213 [skx_edac] skx_register_mci+0x132/0x1c0 [skx_edac] Cc: Bob Moore Cc: Erik Kaneda Fixes: cf16b05c607b ("ACPICA: ACPI 6.4: NFIT: add Location Cookie field") Reported-by: Yi Zhang Tested-by: Yi Zhang Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/162037273007.1195827.10907249070709169329.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 15 +++++++++--- tools/testing/nvdimm/test/nfit.c | 42 +++++++++++++++++++------------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 958aaac869e8d..23d9a09d70604 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -686,6 +686,13 @@ int nfit_spa_type(struct acpi_nfit_system_address *spa) return -1; } +static size_t sizeof_spa(struct acpi_nfit_system_address *spa) +{ + if (spa->flags & ACPI_NFIT_LOCATION_COOKIE_VALID) + return sizeof(*spa); + return sizeof(*spa) - 8; +} + static bool add_spa(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_system_address *spa) @@ -693,22 +700,22 @@ static bool add_spa(struct acpi_nfit_desc *acpi_desc, struct device *dev = acpi_desc->dev; struct nfit_spa *nfit_spa; - if (spa->header.length != sizeof(*spa)) + if (spa->header.length != sizeof_spa(spa)) return false; list_for_each_entry(nfit_spa, &prev->spas, list) { - if (memcmp(nfit_spa->spa, spa, sizeof(*spa)) == 0) { + if (memcmp(nfit_spa->spa, spa, sizeof_spa(spa)) == 0) { list_move_tail(&nfit_spa->list, &acpi_desc->spas); return true; } } - nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa) + sizeof(*spa), + nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa) + sizeof_spa(spa), GFP_KERNEL); if (!nfit_spa) return false; INIT_LIST_HEAD(&nfit_spa->list); - memcpy(nfit_spa->spa, spa, sizeof(*spa)); + memcpy(nfit_spa->spa, spa, sizeof_spa(spa)); list_add_tail(&nfit_spa->list, &acpi_desc->spas); dev_dbg(dev, "spa index: %d type: %s\n", spa->range_index, diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 9b185bf82da87..54f367cbadaee 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1871,9 +1871,16 @@ static void smart_init(struct nfit_test *t) } } +static size_t sizeof_spa(struct acpi_nfit_system_address *spa) +{ + /* until spa location cookie support is added... */ + return sizeof(*spa) - 8; +} + static int nfit_test0_alloc(struct nfit_test *t) { - size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA + struct acpi_nfit_system_address *spa = NULL; + size_t nfit_size = sizeof_spa(spa) * NUM_SPA + sizeof(struct acpi_nfit_memory_map) * NUM_MEM + sizeof(struct acpi_nfit_control_region) * NUM_DCR + offsetof(struct acpi_nfit_control_region, @@ -1937,7 +1944,8 @@ static int nfit_test0_alloc(struct nfit_test *t) static int nfit_test1_alloc(struct nfit_test *t) { - size_t nfit_size = sizeof(struct acpi_nfit_system_address) * 2 + struct acpi_nfit_system_address *spa = NULL; + size_t nfit_size = sizeof_spa(spa) * 2 + sizeof(struct acpi_nfit_memory_map) * 2 + offsetof(struct acpi_nfit_control_region, window_size) * 2; int i; @@ -2000,7 +2008,7 @@ static void nfit_test0_setup(struct nfit_test *t) */ spa = nfit_buf; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); spa->range_index = 0+1; spa->address = t->spa_set_dma[0]; @@ -2014,7 +2022,7 @@ static void nfit_test0_setup(struct nfit_test *t) */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); spa->range_index = 1+1; spa->address = t->spa_set_dma[1]; @@ -2024,7 +2032,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa2 (dcr0) dimm0 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 2+1; spa->address = t->dcr_dma[0]; @@ -2034,7 +2042,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa3 (dcr1) dimm1 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 3+1; spa->address = t->dcr_dma[1]; @@ -2044,7 +2052,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa4 (dcr2) dimm2 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 4+1; spa->address = t->dcr_dma[2]; @@ -2054,7 +2062,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa5 (dcr3) dimm3 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 5+1; spa->address = t->dcr_dma[3]; @@ -2064,7 +2072,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa6 (bdw for dcr0) dimm0 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 6+1; spa->address = t->dimm_dma[0]; @@ -2074,7 +2082,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa7 (bdw for dcr1) dimm1 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 7+1; spa->address = t->dimm_dma[1]; @@ -2084,7 +2092,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa8 (bdw for dcr2) dimm2 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 8+1; spa->address = t->dimm_dma[2]; @@ -2094,7 +2102,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa9 (bdw for dcr3) dimm3 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 9+1; spa->address = t->dimm_dma[3]; @@ -2581,7 +2589,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa10 (dcr4) dimm4 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 10+1; spa->address = t->dcr_dma[4]; @@ -2595,7 +2603,7 @@ static void nfit_test0_setup(struct nfit_test *t) */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); spa->range_index = 11+1; spa->address = t->spa_set_dma[2]; @@ -2605,7 +2613,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa12 (bdw for dcr4) dimm4 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 12+1; spa->address = t->dimm_dma[4]; @@ -2739,7 +2747,7 @@ static void nfit_test1_setup(struct nfit_test *t) /* spa0 (flat range with no bdw aliasing) */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); spa->range_index = 0+1; spa->address = t->spa_set_dma[0]; @@ -2749,7 +2757,7 @@ static void nfit_test1_setup(struct nfit_test *t) /* virtual cd region */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_VCD), 16); spa->range_index = 0; spa->address = t->spa_set_dma[1]; From a554e740b66a83c7560b30e6b50bece37555ced3 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 22 Apr 2021 12:04:42 -0700 Subject: [PATCH 206/263] x86/boot/compressed: Enable -Wundef A discussion around -Wundef showed that there were still a few boolean Kconfigs where #if was used rather than #ifdef to guard different code. Kconfig doesn't define boolean configs, which can result in -Wundef warnings. arch/x86/boot/compressed/Makefile resets the CFLAGS used for this directory, and doesn't re-enable -Wundef as the top level Makefile does. If re-added, with RANDOMIZE_BASE and X86_NEED_RELOCS disabled, the following warnings are visible. arch/x86/boot/compressed/misc.h:82:5: warning: 'CONFIG_RANDOMIZE_BASE' is not defined, evaluates to 0 [-Wundef] ^ arch/x86/boot/compressed/misc.c:175:5: warning: 'CONFIG_X86_NEED_RELOCS' is not defined, evaluates to 0 [-Wundef] ^ Simply fix these and re-enable this warning for this directory. Suggested-by: Nathan Chancellor Suggested-by: Joe Perches Signed-off-by: Nick Desaulniers Signed-off-by: Ingo Molnar Acked-by: Kees Cook Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20210422190450.3903999-1-ndesaulniers@google.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/misc.c | 2 +- arch/x86/boot/compressed/misc.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 2a2975236c9e3..431bf7f846c3c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -30,6 +30,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ KBUILD_CFLAGS := -m$(BITS) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIE +KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index dde042f64ccaa..743f13ea25c12 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -172,7 +172,7 @@ void __puthex(unsigned long value) } } -#if CONFIG_X86_NEED_RELOCS +#ifdef CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) { diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index e5612f035498c..31139256859fc 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -79,7 +79,7 @@ struct mem_vector { u64 size; }; -#if CONFIG_RANDOMIZE_BASE +#ifdef CONFIG_RANDOMIZE_BASE /* kaslr.c */ void choose_random_location(unsigned long input, unsigned long input_size, From 3b5169c2eb81e822445469a077223f8eb0729a59 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 12 May 2021 15:48:09 -0700 Subject: [PATCH 207/263] hwmon: (adm9240) Fix writes into inX_max attributes When converting the driver to use the devm_hwmon_device_register_with_info API, the wrong register was selected when writing into inX_max attributes. Fix it. Fixes: 124b7e34a5a6 ("hwmon: (adm9240) Convert to devm_hwmon_device_register_with_info API") Reported-by: Chris Packham Tested-by: Chris Packham Signed-off-by: Guenter Roeck --- drivers/hwmon/adm9240.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/adm9240.c b/drivers/hwmon/adm9240.c index 5677263bcf0de..483cd757abd33 100644 --- a/drivers/hwmon/adm9240.c +++ b/drivers/hwmon/adm9240.c @@ -485,7 +485,7 @@ static int adm9240_in_write(struct device *dev, u32 attr, int channel, long val) reg = ADM9240_REG_IN_MIN(channel); break; case hwmon_in_max: - reg = ADM9240_REG_IN(channel); + reg = ADM9240_REG_IN_MAX(channel); break; default: return -EOPNOTSUPP; From 0852b6ca941ef3ff75076e85738877bd3271e1cd Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 10 May 2021 14:47:15 +0800 Subject: [PATCH 208/263] erofs: fix 1 lcluster-sized pcluster for big pcluster If the 1st NONHEAD lcluster of a pcluster isn't CBLKCNT lcluster type rather than a HEAD or PLAIN type instead, which means its pclustersize _must_ be 1 lcluster (since its uncompressed size < 2 lclusters), as illustrated below: HEAD HEAD / PLAIN lcluster type ____________ ____________ |_:__________|_________:__| file data (uncompressed) . . .____________. |____________| pcluster data (compressed) Such on-disk case was explained before [1] but missed to be handled properly in the runtime implementation. It can be observed if manually generating 1 lcluster-sized pcluster with 2 lclusters (thus CBLKCNT doesn't exist.) Let's fix it now. [1] https://lore.kernel.org/r/20210407043927.10623-1-xiang@kernel.org Link: https://lore.kernel.org/r/20210510064715.29123-1-xiang@kernel.org Fixes: cec6e93beadf ("erofs: support parsing big pcluster compress indexes") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index e62d813756f28..efaf32596b97f 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -450,14 +450,31 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, lcn = m->lcn + 1; if (m->compressedlcs) goto out; - if (lcn == initial_lcn) - goto err_bonus_cblkcnt; err = z_erofs_load_cluster_from_disk(m, lcn); if (err) return err; + /* + * If the 1st NONHEAD lcluster has already been handled initially w/o + * valid compressedlcs, which means at least it mustn't be CBLKCNT, or + * an internal implemenatation error is detected. + * + * The following code can also handle it properly anyway, but let's + * BUG_ON in the debugging mode only for developers to notice that. + */ + DBG_BUGON(lcn == initial_lcn && + m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD); + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + /* + * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type + * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. + */ + m->compressedlcs = 1; + break; case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) goto err_bonus_cblkcnt; From 3743d55b289c203d8f77b7cd47c24926b9d186ae Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Sun, 25 Apr 2021 15:34:51 +0800 Subject: [PATCH 209/263] x86, sched: Fix the AMD CPPC maximum performance value on certain AMD Ryzen generations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some AMD Ryzen generations has different calculation method on maximum performance. 255 is not for all ASICs, some specific generations should use 166 as the maximum performance. Otherwise, it will report incorrect frequency value like below: ~ → lscpu | grep MHz CPU MHz: 3400.000 CPU max MHz: 7228.3198 CPU min MHz: 2200.0000 [ mingo: Tidied up whitespace use. ] [ Alexander Monakov : fix 225 -> 255 typo. ] Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") Fixes: 3c55e94c0ade ("cpufreq: ACPI: Extend frequency tables to cover boost frequencies") Reported-by: Jason Bagavatsingham Fixed-by: Alexander Monakov Reviewed-by: Rafael J. Wysocki Signed-off-by: Huang Rui Signed-off-by: Ingo Molnar Tested-by: Jason Bagavatsingham Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210425073451.2557394-1-ray.huang@amd.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=211791 Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/cpu/amd.c | 16 ++++++++++++++++ arch/x86/kernel/smpboot.c | 2 +- drivers/cpufreq/acpi-cpufreq.c | 6 +++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 154321d29050f..556b2b17c3e2f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -787,8 +787,10 @@ DECLARE_PER_CPU(u64, msr_misc_features_shadow); #ifdef CONFIG_CPU_SUP_AMD extern u32 amd_get_nodes_per_socket(void); +extern u32 amd_get_highest_perf(void); #else static inline u32 amd_get_nodes_per_socket(void) { return 0; } +static inline u32 amd_get_highest_perf(void) { return 0; } #endif static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d11384dc9ab4..6d7b3b3ea80b1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1165,3 +1165,19 @@ void set_dr_addr_mask(unsigned long mask, int dr) break; } } + +u32 amd_get_highest_perf(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || + (c->x86_model >= 0x70 && c->x86_model < 0x80))) + return 166; + + if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || + (c->x86_model >= 0x40 && c->x86_model < 0x70))) + return 166; + + return 255; +} +EXPORT_SYMBOL_GPL(amd_get_highest_perf); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0ad5214f598a9..7770245cc7fa7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -2043,7 +2043,7 @@ static bool amd_set_max_freq_ratio(void) return false; } - highest_perf = perf_caps.highest_perf; + highest_perf = amd_get_highest_perf(); nominal_perf = perf_caps.nominal_perf; if (!highest_perf || !nominal_perf) { diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index d1bbc16fba4b4..7e7450453714d 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -646,7 +646,11 @@ static u64 get_max_boost_ratio(unsigned int cpu) return 0; } - highest_perf = perf_caps.highest_perf; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + highest_perf = amd_get_highest_perf(); + else + highest_perf = perf_caps.highest_perf; + nominal_perf = perf_caps.nominal_perf; if (!highest_perf || !nominal_perf) { From b813511135e8b84fa741afdfbab4937919100bef Mon Sep 17 00:00:00 2001 From: Abhijeet Rao Date: Wed, 12 May 2021 11:08:12 +0300 Subject: [PATCH 210/263] xhci-pci: Allow host runtime PM as default for Intel Alder Lake xHCI In the same way as Intel Tiger Lake TCSS (Type-C Subsystem) the Alder Lake TCSS xHCI needs to be runtime suspended whenever possible to allow the TCSS hardware block to enter D3cold and thus save energy. Cc: stable@vger.kernel.org Signed-off-by: Abhijeet Rao Signed-off-by: Nikunj A. Dadhania Signed-off-by: Azhar Shaikh Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210512080816.866037-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 5bbccc9a0179f..a858add8436c5 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -57,6 +57,7 @@ #define PCI_DEVICE_ID_INTEL_CML_XHCI 0xa3af #define PCI_DEVICE_ID_INTEL_TIGER_LAKE_XHCI 0x9a13 #define PCI_DEVICE_ID_INTEL_MAPLE_RIDGE_XHCI 0x1138 +#define PCI_DEVICE_ID_INTEL_ALDER_LAKE_XHCI 0x461e #define PCI_DEVICE_ID_AMD_PROMONTORYA_4 0x43b9 #define PCI_DEVICE_ID_AMD_PROMONTORYA_3 0x43ba @@ -243,7 +244,8 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) pdev->device == PCI_DEVICE_ID_INTEL_TITAN_RIDGE_DD_XHCI || pdev->device == PCI_DEVICE_ID_INTEL_ICE_LAKE_XHCI || pdev->device == PCI_DEVICE_ID_INTEL_TIGER_LAKE_XHCI || - pdev->device == PCI_DEVICE_ID_INTEL_MAPLE_RIDGE_XHCI)) + pdev->device == PCI_DEVICE_ID_INTEL_MAPLE_RIDGE_XHCI || + pdev->device == PCI_DEVICE_ID_INTEL_ALDER_LAKE_XHCI)) xhci->quirks |= XHCI_DEFAULT_PM_RUNTIME_ALLOW; if (pdev->vendor == PCI_VENDOR_ID_ETRON && From 9b6a126ae58d9edfdde2d5f2e87f7615ea5e0155 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Wed, 12 May 2021 11:08:13 +0300 Subject: [PATCH 211/263] xhci: Fix giving back cancelled URBs even if halted endpoint can't reset Commit 9ebf30007858 ("xhci: Fix halted endpoint at stop endpoint command completion") in 5.12 changes how cancelled URBs are given back. To cancel a URB xhci driver needs to stop the endpoint first. To clear a halted endpoint xhci driver needs to reset the endpoint. In rare cases when an endpoint halt (error) races with a endpoint stop we need to clear the reset before removing, and giving back the cancelled URB. The above change in 5.12 takes care of this, but it also relies on the reset endpoint completion handler to give back the cancelled URBs. There are cases when driver refuses to queue reset endpoint commands, for example when a link suddenly goes to an inactive error state. In this case the cancelled URB is never given back. Fix this by giving back the URB in the stop endpoint if queuing a reset endpoint command fails. Fixes: 9ebf30007858 ("xhci: Fix halted endpoint at stop endpoint command completion") CC: # 5.12 Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210512080816.866037-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 05c38dd3ee361..a8e4189277da8 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -862,7 +862,7 @@ static int xhci_reset_halted_ep(struct xhci_hcd *xhci, unsigned int slot_id, return ret; } -static void xhci_handle_halted_endpoint(struct xhci_hcd *xhci, +static int xhci_handle_halted_endpoint(struct xhci_hcd *xhci, struct xhci_virt_ep *ep, unsigned int stream_id, struct xhci_td *td, enum xhci_ep_reset_type reset_type) @@ -875,7 +875,7 @@ static void xhci_handle_halted_endpoint(struct xhci_hcd *xhci, * Device will be reset soon to recover the link so don't do anything */ if (ep->vdev->flags & VDEV_PORT_ERROR) - return; + return -ENODEV; /* add td to cancelled list and let reset ep handler take care of it */ if (reset_type == EP_HARD_RESET) { @@ -888,16 +888,18 @@ static void xhci_handle_halted_endpoint(struct xhci_hcd *xhci, if (ep->ep_state & EP_HALTED) { xhci_dbg(xhci, "Reset ep command already pending\n"); - return; + return 0; } err = xhci_reset_halted_ep(xhci, slot_id, ep->ep_index, reset_type); if (err) - return; + return err; ep->ep_state |= EP_HALTED; xhci_ring_cmd_db(xhci); + + return 0; } /* @@ -1014,6 +1016,7 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id, struct xhci_td *td = NULL; enum xhci_ep_reset_type reset_type; struct xhci_command *command; + int err; if (unlikely(TRB_TO_SUSPEND_PORT(le32_to_cpu(trb->generic.field[3])))) { if (!xhci->devs[slot_id]) @@ -1058,7 +1061,10 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id, td->status = -EPROTO; } /* reset ep, reset handler cleans up cancelled tds */ - xhci_handle_halted_endpoint(xhci, ep, 0, td, reset_type); + err = xhci_handle_halted_endpoint(xhci, ep, 0, td, + reset_type); + if (err) + break; xhci_stop_watchdog_timer_in_irq(xhci, ep); return; case EP_STATE_RUNNING: From dda32c00c9a0fa103b5d54ef72c477b7aa993679 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 12 May 2021 11:08:14 +0300 Subject: [PATCH 212/263] xhci: Do not use GFP_KERNEL in (potentially) atomic context 'xhci_urb_enqueue()' is passed a 'mem_flags' argument, because "URBs may be submitted in interrupt context" (see comment related to 'usb_submit_urb()' in 'drivers/usb/core/urb.c') So this flag should be used in all the calling chain. Up to now, 'xhci_check_maxpacket()' which is only called from 'xhci_urb_enqueue()', uses GFP_KERNEL. Be safe and pass the mem_flags to this function as well. Fixes: ddba5cd0aeff ("xhci: Use command structures when queuing commands on the command ring") Cc: Signed-off-by: Christophe JAILLET Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210512080816.866037-4-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index ca9385d22f68d..27283654ca080 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1514,7 +1514,7 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci, * we need to issue an evaluate context command and wait on it. */ static int xhci_check_maxpacket(struct xhci_hcd *xhci, unsigned int slot_id, - unsigned int ep_index, struct urb *urb) + unsigned int ep_index, struct urb *urb, gfp_t mem_flags) { struct xhci_container_ctx *out_ctx; struct xhci_input_control_ctx *ctrl_ctx; @@ -1545,7 +1545,7 @@ static int xhci_check_maxpacket(struct xhci_hcd *xhci, unsigned int slot_id, * changes max packet sizes. */ - command = xhci_alloc_command(xhci, true, GFP_KERNEL); + command = xhci_alloc_command(xhci, true, mem_flags); if (!command) return -ENOMEM; @@ -1639,7 +1639,7 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag */ if (urb->dev->speed == USB_SPEED_FULL) { ret = xhci_check_maxpacket(xhci, slot_id, - ep_index, urb); + ep_index, urb, mem_flags); if (ret < 0) { xhci_urb_free_priv(urb_priv); urb->hcpriv = NULL; From ca09b1bea63ab83f4cca3a2ae8bc4f597ec28851 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Wed, 12 May 2021 11:08:15 +0300 Subject: [PATCH 213/263] usb: xhci: Increase timeout for HC halt On some devices (specifically the SC8180x based Surface Pro X with QCOM04A6) HC halt / xhci_halt() times out during boot. Manually binding the xhci-hcd driver at some point later does not exhibit this behavior. To work around this, double XHCI_MAX_HALT_USEC, which also resolves this issue. Cc: Signed-off-by: Maximilian Luz Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210512080816.866037-5-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ext-caps.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/xhci-ext-caps.h b/drivers/usb/host/xhci-ext-caps.h index fa59b242cd515..e8af0a125f84b 100644 --- a/drivers/usb/host/xhci-ext-caps.h +++ b/drivers/usb/host/xhci-ext-caps.h @@ -7,8 +7,9 @@ * Author: Sarah Sharp * Some code borrowed from the Linux EHCI driver. */ -/* Up to 16 ms to halt an HC */ -#define XHCI_MAX_HALT_USEC (16*1000) + +/* HC should halt within 16 ms, but use 32 ms as some hosts take longer */ +#define XHCI_MAX_HALT_USEC (32 * 1000) /* HC not running - set to 1 when run/stop bit is cleared. */ #define XHCI_STS_HALT (1<<0) From 3c128781d8da463761495aaf8898c9ecb4e71528 Mon Sep 17 00:00:00 2001 From: Sandeep Singh Date: Wed, 12 May 2021 11:08:16 +0300 Subject: [PATCH 214/263] xhci: Add reset resume quirk for AMD xhci controller. One of AMD xhci controller require reset on resume. Occasionally AMD xhci controller does not respond to Stop endpoint command. Once the issue happens controller goes into bad state and in that case controller needs to be reset. Cc: Signed-off-by: Sandeep Singh Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210512080816.866037-6-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index a858add8436c5..7bc18cf8042cc 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -167,8 +167,10 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) (pdev->device == 0x15e0 || pdev->device == 0x15e1)) xhci->quirks |= XHCI_SNPS_BROKEN_SUSPEND; - if (pdev->vendor == PCI_VENDOR_ID_AMD && pdev->device == 0x15e5) + if (pdev->vendor == PCI_VENDOR_ID_AMD && pdev->device == 0x15e5) { xhci->quirks |= XHCI_DISABLE_SPARSE; + xhci->quirks |= XHCI_RESET_ON_RESUME; + } if (pdev->vendor == PCI_VENDOR_ID_AMD) xhci->quirks |= XHCI_TRUST_TX_LENGTH; From 12701ce524bc9b7c6345a2425208501fd2c62aad Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 10 May 2021 14:17:56 -0700 Subject: [PATCH 215/263] usb: typec: tcpm: Fix SINK_DISCOVERY current limit for Rp-default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a regression introduced by 1373fefc6243 ("usb: typec: tcpm: Allow slow charging loops to comply to pSnkStby") When Source advertises Rp-default, tcpm would request 500mA when in SINK_DISCOVERY, Type-C spec advises the sink to follow BC1.2 current limits when Rp-default is advertised. [12750.503381] Requesting mux state 1, usb-role 2, orientation 1 [12750.503837] state change SNK_ATTACHED -> SNK_STARTUP [rev3 NONE_AMS] [12751.003891] state change SNK_STARTUP -> SNK_DISCOVERY [12751.003900] Setting voltage/current limit 5000 mV 500 mA This patch restores the behavior where the tcpm would request 0mA when Rp-default is advertised by the source. [   73.174252] Requesting mux state 1, usb-role 2, orientation 1 [   73.174749] state change SNK_ATTACHED -> SNK_STARTUP [rev3 NONE_AMS] [   73.674800] state change SNK_STARTUP -> SNK_DISCOVERY [   73.674808] Setting voltage/current limit 5000 mV 0 mA During SNK_DISCOVERY, Cap the current limit to PD_P_SNK_STDBY_MW / 5 only for slow_charger_loop case. Fixes: 1373fefc6243 ("usb: typec: tcpm: Allow slow charging loops to comply to pSnkStby") Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20210510211756.3346954-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index db567e6fde924..72e4d63a23669 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -4085,7 +4085,7 @@ static void run_state_machine(struct tcpm_port *port) if (port->vbus_present) { u32 current_lim = tcpm_get_current_limit(port); - if (port->slow_charger_loop || (current_lim > PD_P_SNK_STDBY_MW / 5)) + if (port->slow_charger_loop && (current_lim > PD_P_SNK_STDBY_MW / 5)) current_lim = PD_P_SNK_STDBY_MW / 5; tcpm_set_current_limit(port, current_lim, 5000); tcpm_set_charge(port, true); From 975f94c7d6c306b833628baa9aec3f79db1eb3a1 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Wed, 12 May 2021 10:07:38 +0800 Subject: [PATCH 216/263] usb: core: hub: fix race condition about TRSMRCY of resume This may happen if the port becomes resume status exactly when usb_port_resume() gets port status, it still need provide a TRSMCRY time before access the device. CC: Reported-by: Tianping Fang Acked-by: Alan Stern Signed-off-by: Chunfeng Yun Link: https://lore.kernel.org/r/20210512020738.52961-1-chunfeng.yun@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index b2bc4b7c42895..fc7d6cdacf16b 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -3642,9 +3642,6 @@ int usb_port_resume(struct usb_device *udev, pm_message_t msg) * sequence. */ status = hub_port_status(hub, port1, &portstatus, &portchange); - - /* TRSMRCY = 10 msec */ - msleep(10); } SuspendCleared: @@ -3659,6 +3656,9 @@ int usb_port_resume(struct usb_device *udev, pm_message_t msg) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_SUSPEND); } + + /* TRSMRCY = 10 msec */ + msleep(10); } if (udev->persist_enabled) From e181811bd04d874fe48bbfa1165a82068b58144d Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Thu, 13 May 2021 21:04:10 +0800 Subject: [PATCH 217/263] nvmet: use new ana_log_size instead the old one The new ana_log_size should be used instead of the old one. Or kernel NULL pointer dereference will happen like below: [ 38.957849][ T69] BUG: kernel NULL pointer dereference, address: 000000000000003c [ 38.975550][ T69] #PF: supervisor write access in kernel mode [ 38.975955][ T69] #PF: error_code(0x0002) - not-present page [ 38.976905][ T69] PGD 0 P4D 0 [ 38.979388][ T69] Oops: 0002 [#1] SMP NOPTI [ 38.980488][ T69] CPU: 0 PID: 69 Comm: kworker/0:2 Not tainted 5.12.0+ #54 [ 38.981254][ T69] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 38.982502][ T69] Workqueue: events nvme_loop_execute_work [ 38.985219][ T69] RIP: 0010:memcpy_orig+0x68/0x10f [ 38.986203][ T69] Code: 83 c2 20 eb 44 48 01 d6 48 01 d7 48 83 ea 20 0f 1f 00 48 83 ea 20 4c 8b 46 f8 4c 8b 4e f0 4c 8b 56 e8 4c 8b 5e e0 48 8d 76 e0 <4c> 89 47 f8 4c 89 4f f0 4c 89 57 e8 4c 89 5f e0 48 8d 7f e0 73 d2 [ 38.987677][ T69] RSP: 0018:ffffc900001b7d48 EFLAGS: 00000287 [ 38.987996][ T69] RAX: 0000000000000020 RBX: 0000000000000024 RCX: 0000000000000010 [ 38.988327][ T69] RDX: ffffffffffffffe4 RSI: ffff8881084bc004 RDI: 0000000000000044 [ 38.988620][ T69] RBP: 0000000000000024 R08: 0000000100000000 R09: 0000000000000000 [ 38.988991][ T69] R10: 0000000100000000 R11: 0000000000000001 R12: 0000000000000024 [ 38.989289][ T69] R13: ffff8881084bc000 R14: 0000000000000000 R15: 0000000000000024 [ 38.989845][ T69] FS: 0000000000000000(0000) GS:ffff888237c00000(0000) knlGS:0000000000000000 [ 38.990234][ T69] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 38.990490][ T69] CR2: 000000000000003c CR3: 00000001085b2000 CR4: 00000000000006f0 [ 38.991105][ T69] Call Trace: [ 38.994157][ T69] sg_copy_buffer+0xb8/0xf0 [ 38.995357][ T69] nvmet_copy_to_sgl+0x48/0x6d [ 38.995565][ T69] nvmet_execute_get_log_page_ana+0xd4/0x1cb [ 38.995792][ T69] nvmet_execute_get_log_page+0xc9/0x146 [ 38.995992][ T69] nvme_loop_execute_work+0x3e/0x44 [ 38.996181][ T69] process_one_work+0x1c3/0x3c0 [ 38.996393][ T69] worker_thread+0x44/0x3d0 [ 38.996600][ T69] ? cancel_delayed_work+0x90/0x90 [ 38.996804][ T69] kthread+0xf7/0x130 [ 38.996961][ T69] ? kthread_create_worker_on_cpu+0x70/0x70 [ 38.997171][ T69] ret_from_fork+0x22/0x30 [ 38.997705][ T69] Modules linked in: [ 38.998741][ T69] CR2: 000000000000003c [ 39.000104][ T69] ---[ end trace e719927b609d0fa0 ]--- Fixes: 5e1f689913a4 ("nvme-multipath: fix double initialization of ANA state") Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index deb14562c96ae..f81871c7128a0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -817,7 +817,7 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) if (ana_log_size > ctrl->ana_log_size) { nvme_mpath_stop(ctrl); kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); + ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL); if (!ctrl->ana_log_buf) return -ENOMEM; } From 5d31950a483381b5444494dfb7fa5ed764193b92 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 9 May 2021 17:49:26 -0500 Subject: [PATCH 218/263] drm/radeon/ni_dpm: Fix booting bug Create new structure NISLANDS_SMC_SWSTATE_SINGLE, as initialState.levels and ACPIState.levels are never actually used as flexible arrays. Those arrays can be used as simple objects of type NISLANDS_SMC_HW_PERFORMANCE_LEVEL, instead. Currently, the code fails because flexible array _levels_ in struct NISLANDS_SMC_SWSTATE doesn't allow for code that access the first element of initialState.levels and ACPIState.levels arrays: drivers/gpu/drm/radeon/ni_dpm.c: 1690 table->initialState.levels[0].mclk.vMPLL_AD_FUNC_CNTL = 1691 cpu_to_be32(ni_pi->clock_registers.mpll_ad_func_cntl); ... 1903: table->ACPIState.levels[0].mclk.vMPLL_AD_FUNC_CNTL = cpu_to_be32(mpll_ad_func_cntl); 1904: table->ACPIState.levels[0].mclk.vMPLL_AD_FUNC_CNTL_2 = cpu_to_be32(mpll_ad_func_cntl_2); because such element cannot exist without previously allocating any dynamic memory for it (which never actually happens). That's why struct NISLANDS_SMC_SWSTATE should only be used as type for object driverState and new struct SISLANDS_SMC_SWSTATE_SINGLE is created as type for objects initialState, ACPIState and ULVState. Also, with the change from one-element array to flexible-array member in commit 434fb1e7444a ("drm/radeon/nislands_smc.h: Replace one-element array with flexible-array member in struct NISLANDS_SMC_SWSTATE"), the size of dpmLevels in struct NISLANDS_SMC_STATETABLE should be fixed to be NISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE instead of NISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE - 1. Bug: https://lore.kernel.org/dri-devel/3eedbe78-1fbd-4763-a7f3-ac5665e76a4a@xenosoft.de/ Fixes: 434fb1e7444a ("drm/radeon/nislands_smc.h: Replace one-element array with flexible-array member in struct NISLANDS_SMC_SWSTATE") Cc: stable@vger.kernel.org Reported-by: Christian Zigotzky Tested-by: Christian Zigotzky Link: https://lore.kernel.org/dri-devel/9bb5fcbd-daf5-1669-b3e7-b8624b3c36f9@xenosoft.de/ Signed-off-by: Gustavo A. R. Silva Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/ni_dpm.c | 144 +++++++++++++------------- drivers/gpu/drm/radeon/nislands_smc.h | 34 +++--- 2 files changed, 94 insertions(+), 84 deletions(-) diff --git a/drivers/gpu/drm/radeon/ni_dpm.c b/drivers/gpu/drm/radeon/ni_dpm.c index dd5ef64937230..769f666335ac4 100644 --- a/drivers/gpu/drm/radeon/ni_dpm.c +++ b/drivers/gpu/drm/radeon/ni_dpm.c @@ -1687,102 +1687,102 @@ static int ni_populate_smc_initial_state(struct radeon_device *rdev, u32 reg; int ret; - table->initialState.levels[0].mclk.vMPLL_AD_FUNC_CNTL = + table->initialState.level.mclk.vMPLL_AD_FUNC_CNTL = cpu_to_be32(ni_pi->clock_registers.mpll_ad_func_cntl); - table->initialState.levels[0].mclk.vMPLL_AD_FUNC_CNTL_2 = + table->initialState.level.mclk.vMPLL_AD_FUNC_CNTL_2 = cpu_to_be32(ni_pi->clock_registers.mpll_ad_func_cntl_2); - table->initialState.levels[0].mclk.vMPLL_DQ_FUNC_CNTL = + table->initialState.level.mclk.vMPLL_DQ_FUNC_CNTL = cpu_to_be32(ni_pi->clock_registers.mpll_dq_func_cntl); - table->initialState.levels[0].mclk.vMPLL_DQ_FUNC_CNTL_2 = + table->initialState.level.mclk.vMPLL_DQ_FUNC_CNTL_2 = cpu_to_be32(ni_pi->clock_registers.mpll_dq_func_cntl_2); - table->initialState.levels[0].mclk.vMCLK_PWRMGT_CNTL = + table->initialState.level.mclk.vMCLK_PWRMGT_CNTL = cpu_to_be32(ni_pi->clock_registers.mclk_pwrmgt_cntl); - table->initialState.levels[0].mclk.vDLL_CNTL = + table->initialState.level.mclk.vDLL_CNTL = cpu_to_be32(ni_pi->clock_registers.dll_cntl); - table->initialState.levels[0].mclk.vMPLL_SS = + table->initialState.level.mclk.vMPLL_SS = cpu_to_be32(ni_pi->clock_registers.mpll_ss1); - table->initialState.levels[0].mclk.vMPLL_SS2 = + table->initialState.level.mclk.vMPLL_SS2 = cpu_to_be32(ni_pi->clock_registers.mpll_ss2); - table->initialState.levels[0].mclk.mclk_value = + table->initialState.level.mclk.mclk_value = cpu_to_be32(initial_state->performance_levels[0].mclk); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL = cpu_to_be32(ni_pi->clock_registers.cg_spll_func_cntl); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_2 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_2 = cpu_to_be32(ni_pi->clock_registers.cg_spll_func_cntl_2); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_3 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_3 = cpu_to_be32(ni_pi->clock_registers.cg_spll_func_cntl_3); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_4 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_4 = cpu_to_be32(ni_pi->clock_registers.cg_spll_func_cntl_4); - table->initialState.levels[0].sclk.vCG_SPLL_SPREAD_SPECTRUM = + table->initialState.level.sclk.vCG_SPLL_SPREAD_SPECTRUM = cpu_to_be32(ni_pi->clock_registers.cg_spll_spread_spectrum); - table->initialState.levels[0].sclk.vCG_SPLL_SPREAD_SPECTRUM_2 = + table->initialState.level.sclk.vCG_SPLL_SPREAD_SPECTRUM_2 = cpu_to_be32(ni_pi->clock_registers.cg_spll_spread_spectrum_2); - table->initialState.levels[0].sclk.sclk_value = + table->initialState.level.sclk.sclk_value = cpu_to_be32(initial_state->performance_levels[0].sclk); - table->initialState.levels[0].arbRefreshState = + table->initialState.level.arbRefreshState = NISLANDS_INITIAL_STATE_ARB_INDEX; - table->initialState.levels[0].ACIndex = 0; + table->initialState.level.ACIndex = 0; ret = ni_populate_voltage_value(rdev, &eg_pi->vddc_voltage_table, initial_state->performance_levels[0].vddc, - &table->initialState.levels[0].vddc); + &table->initialState.level.vddc); if (!ret) { u16 std_vddc; ret = ni_get_std_voltage_value(rdev, - &table->initialState.levels[0].vddc, + &table->initialState.level.vddc, &std_vddc); if (!ret) ni_populate_std_voltage_value(rdev, std_vddc, - table->initialState.levels[0].vddc.index, - &table->initialState.levels[0].std_vddc); + table->initialState.level.vddc.index, + &table->initialState.level.std_vddc); } if (eg_pi->vddci_control) ni_populate_voltage_value(rdev, &eg_pi->vddci_voltage_table, initial_state->performance_levels[0].vddci, - &table->initialState.levels[0].vddci); + &table->initialState.level.vddci); - ni_populate_initial_mvdd_value(rdev, &table->initialState.levels[0].mvdd); + ni_populate_initial_mvdd_value(rdev, &table->initialState.level.mvdd); reg = CG_R(0xffff) | CG_L(0); - table->initialState.levels[0].aT = cpu_to_be32(reg); + table->initialState.level.aT = cpu_to_be32(reg); - table->initialState.levels[0].bSP = cpu_to_be32(pi->dsp); + table->initialState.level.bSP = cpu_to_be32(pi->dsp); if (pi->boot_in_gen2) - table->initialState.levels[0].gen2PCIE = 1; + table->initialState.level.gen2PCIE = 1; else - table->initialState.levels[0].gen2PCIE = 0; + table->initialState.level.gen2PCIE = 0; if (pi->mem_gddr5) { - table->initialState.levels[0].strobeMode = + table->initialState.level.strobeMode = cypress_get_strobe_mode_settings(rdev, initial_state->performance_levels[0].mclk); if (initial_state->performance_levels[0].mclk > pi->mclk_edc_enable_threshold) - table->initialState.levels[0].mcFlags = NISLANDS_SMC_MC_EDC_RD_FLAG | NISLANDS_SMC_MC_EDC_WR_FLAG; + table->initialState.level.mcFlags = NISLANDS_SMC_MC_EDC_RD_FLAG | NISLANDS_SMC_MC_EDC_WR_FLAG; else - table->initialState.levels[0].mcFlags = 0; + table->initialState.level.mcFlags = 0; } table->initialState.levelCount = 1; table->initialState.flags |= PPSMC_SWSTATE_FLAG_DC; - table->initialState.levels[0].dpm2.MaxPS = 0; - table->initialState.levels[0].dpm2.NearTDPDec = 0; - table->initialState.levels[0].dpm2.AboveSafeInc = 0; - table->initialState.levels[0].dpm2.BelowSafeInc = 0; + table->initialState.level.dpm2.MaxPS = 0; + table->initialState.level.dpm2.NearTDPDec = 0; + table->initialState.level.dpm2.AboveSafeInc = 0; + table->initialState.level.dpm2.BelowSafeInc = 0; reg = MIN_POWER_MASK | MAX_POWER_MASK; - table->initialState.levels[0].SQPowerThrottle = cpu_to_be32(reg); + table->initialState.level.SQPowerThrottle = cpu_to_be32(reg); reg = MAX_POWER_DELTA_MASK | STI_SIZE_MASK | LTI_RATIO_MASK; - table->initialState.levels[0].SQPowerThrottle_2 = cpu_to_be32(reg); + table->initialState.level.SQPowerThrottle_2 = cpu_to_be32(reg); return 0; } @@ -1813,43 +1813,43 @@ static int ni_populate_smc_acpi_state(struct radeon_device *rdev, if (pi->acpi_vddc) { ret = ni_populate_voltage_value(rdev, &eg_pi->vddc_voltage_table, - pi->acpi_vddc, &table->ACPIState.levels[0].vddc); + pi->acpi_vddc, &table->ACPIState.level.vddc); if (!ret) { u16 std_vddc; ret = ni_get_std_voltage_value(rdev, - &table->ACPIState.levels[0].vddc, &std_vddc); + &table->ACPIState.level.vddc, &std_vddc); if (!ret) ni_populate_std_voltage_value(rdev, std_vddc, - table->ACPIState.levels[0].vddc.index, - &table->ACPIState.levels[0].std_vddc); + table->ACPIState.level.vddc.index, + &table->ACPIState.level.std_vddc); } if (pi->pcie_gen2) { if (pi->acpi_pcie_gen2) - table->ACPIState.levels[0].gen2PCIE = 1; + table->ACPIState.level.gen2PCIE = 1; else - table->ACPIState.levels[0].gen2PCIE = 0; + table->ACPIState.level.gen2PCIE = 0; } else { - table->ACPIState.levels[0].gen2PCIE = 0; + table->ACPIState.level.gen2PCIE = 0; } } else { ret = ni_populate_voltage_value(rdev, &eg_pi->vddc_voltage_table, pi->min_vddc_in_table, - &table->ACPIState.levels[0].vddc); + &table->ACPIState.level.vddc); if (!ret) { u16 std_vddc; ret = ni_get_std_voltage_value(rdev, - &table->ACPIState.levels[0].vddc, + &table->ACPIState.level.vddc, &std_vddc); if (!ret) ni_populate_std_voltage_value(rdev, std_vddc, - table->ACPIState.levels[0].vddc.index, - &table->ACPIState.levels[0].std_vddc); + table->ACPIState.level.vddc.index, + &table->ACPIState.level.std_vddc); } - table->ACPIState.levels[0].gen2PCIE = 0; + table->ACPIState.level.gen2PCIE = 0; } if (eg_pi->acpi_vddci) { @@ -1857,7 +1857,7 @@ static int ni_populate_smc_acpi_state(struct radeon_device *rdev, ni_populate_voltage_value(rdev, &eg_pi->vddci_voltage_table, eg_pi->acpi_vddci, - &table->ACPIState.levels[0].vddci); + &table->ACPIState.level.vddci); } @@ -1900,37 +1900,37 @@ static int ni_populate_smc_acpi_state(struct radeon_device *rdev, spll_func_cntl_2 &= ~SCLK_MUX_SEL_MASK; spll_func_cntl_2 |= SCLK_MUX_SEL(4); - table->ACPIState.levels[0].mclk.vMPLL_AD_FUNC_CNTL = cpu_to_be32(mpll_ad_func_cntl); - table->ACPIState.levels[0].mclk.vMPLL_AD_FUNC_CNTL_2 = cpu_to_be32(mpll_ad_func_cntl_2); - table->ACPIState.levels[0].mclk.vMPLL_DQ_FUNC_CNTL = cpu_to_be32(mpll_dq_func_cntl); - table->ACPIState.levels[0].mclk.vMPLL_DQ_FUNC_CNTL_2 = cpu_to_be32(mpll_dq_func_cntl_2); - table->ACPIState.levels[0].mclk.vMCLK_PWRMGT_CNTL = cpu_to_be32(mclk_pwrmgt_cntl); - table->ACPIState.levels[0].mclk.vDLL_CNTL = cpu_to_be32(dll_cntl); + table->ACPIState.level.mclk.vMPLL_AD_FUNC_CNTL = cpu_to_be32(mpll_ad_func_cntl); + table->ACPIState.level.mclk.vMPLL_AD_FUNC_CNTL_2 = cpu_to_be32(mpll_ad_func_cntl_2); + table->ACPIState.level.mclk.vMPLL_DQ_FUNC_CNTL = cpu_to_be32(mpll_dq_func_cntl); + table->ACPIState.level.mclk.vMPLL_DQ_FUNC_CNTL_2 = cpu_to_be32(mpll_dq_func_cntl_2); + table->ACPIState.level.mclk.vMCLK_PWRMGT_CNTL = cpu_to_be32(mclk_pwrmgt_cntl); + table->ACPIState.level.mclk.vDLL_CNTL = cpu_to_be32(dll_cntl); - table->ACPIState.levels[0].mclk.mclk_value = 0; + table->ACPIState.level.mclk.mclk_value = 0; - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL = cpu_to_be32(spll_func_cntl); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_2 = cpu_to_be32(spll_func_cntl_2); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_3 = cpu_to_be32(spll_func_cntl_3); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_4 = cpu_to_be32(spll_func_cntl_4); + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL = cpu_to_be32(spll_func_cntl); + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_2 = cpu_to_be32(spll_func_cntl_2); + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_3 = cpu_to_be32(spll_func_cntl_3); + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_4 = cpu_to_be32(spll_func_cntl_4); - table->ACPIState.levels[0].sclk.sclk_value = 0; + table->ACPIState.level.sclk.sclk_value = 0; - ni_populate_mvdd_value(rdev, 0, &table->ACPIState.levels[0].mvdd); + ni_populate_mvdd_value(rdev, 0, &table->ACPIState.level.mvdd); if (eg_pi->dynamic_ac_timing) - table->ACPIState.levels[0].ACIndex = 1; + table->ACPIState.level.ACIndex = 1; - table->ACPIState.levels[0].dpm2.MaxPS = 0; - table->ACPIState.levels[0].dpm2.NearTDPDec = 0; - table->ACPIState.levels[0].dpm2.AboveSafeInc = 0; - table->ACPIState.levels[0].dpm2.BelowSafeInc = 0; + table->ACPIState.level.dpm2.MaxPS = 0; + table->ACPIState.level.dpm2.NearTDPDec = 0; + table->ACPIState.level.dpm2.AboveSafeInc = 0; + table->ACPIState.level.dpm2.BelowSafeInc = 0; reg = MIN_POWER_MASK | MAX_POWER_MASK; - table->ACPIState.levels[0].SQPowerThrottle = cpu_to_be32(reg); + table->ACPIState.level.SQPowerThrottle = cpu_to_be32(reg); reg = MAX_POWER_DELTA_MASK | STI_SIZE_MASK | LTI_RATIO_MASK; - table->ACPIState.levels[0].SQPowerThrottle_2 = cpu_to_be32(reg); + table->ACPIState.level.SQPowerThrottle_2 = cpu_to_be32(reg); return 0; } @@ -1980,7 +1980,9 @@ static int ni_init_smc_table(struct radeon_device *rdev) if (ret) return ret; - table->driverState = table->initialState; + table->driverState.flags = table->initialState.flags; + table->driverState.levelCount = table->initialState.levelCount; + table->driverState.levels[0] = table->initialState.level; table->ULVState = table->initialState; diff --git a/drivers/gpu/drm/radeon/nislands_smc.h b/drivers/gpu/drm/radeon/nislands_smc.h index 7395cb6b3cac6..42f3bab0f9ee6 100644 --- a/drivers/gpu/drm/radeon/nislands_smc.h +++ b/drivers/gpu/drm/radeon/nislands_smc.h @@ -143,6 +143,14 @@ struct NISLANDS_SMC_SWSTATE typedef struct NISLANDS_SMC_SWSTATE NISLANDS_SMC_SWSTATE; +struct NISLANDS_SMC_SWSTATE_SINGLE { + uint8_t flags; + uint8_t levelCount; + uint8_t padding2; + uint8_t padding3; + NISLANDS_SMC_HW_PERFORMANCE_LEVEL level; +}; + #define NISLANDS_SMC_VOLTAGEMASK_VDDC 0 #define NISLANDS_SMC_VOLTAGEMASK_MVDD 1 #define NISLANDS_SMC_VOLTAGEMASK_VDDCI 2 @@ -160,19 +168,19 @@ typedef struct NISLANDS_SMC_VOLTAGEMASKTABLE NISLANDS_SMC_VOLTAGEMASKTABLE; struct NISLANDS_SMC_STATETABLE { - uint8_t thermalProtectType; - uint8_t systemFlags; - uint8_t maxVDDCIndexInPPTable; - uint8_t extraFlags; - uint8_t highSMIO[NISLANDS_MAX_NO_VREG_STEPS]; - uint32_t lowSMIO[NISLANDS_MAX_NO_VREG_STEPS]; - NISLANDS_SMC_VOLTAGEMASKTABLE voltageMaskTable; - PP_NIslands_DPM2Parameters dpm2Params; - NISLANDS_SMC_SWSTATE initialState; - NISLANDS_SMC_SWSTATE ACPIState; - NISLANDS_SMC_SWSTATE ULVState; - NISLANDS_SMC_SWSTATE driverState; - NISLANDS_SMC_HW_PERFORMANCE_LEVEL dpmLevels[NISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE - 1]; + uint8_t thermalProtectType; + uint8_t systemFlags; + uint8_t maxVDDCIndexInPPTable; + uint8_t extraFlags; + uint8_t highSMIO[NISLANDS_MAX_NO_VREG_STEPS]; + uint32_t lowSMIO[NISLANDS_MAX_NO_VREG_STEPS]; + NISLANDS_SMC_VOLTAGEMASKTABLE voltageMaskTable; + PP_NIslands_DPM2Parameters dpm2Params; + struct NISLANDS_SMC_SWSTATE_SINGLE initialState; + struct NISLANDS_SMC_SWSTATE_SINGLE ACPIState; + struct NISLANDS_SMC_SWSTATE_SINGLE ULVState; + NISLANDS_SMC_SWSTATE driverState; + NISLANDS_SMC_HW_PERFORMANCE_LEVEL dpmLevels[NISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE]; }; typedef struct NISLANDS_SMC_STATETABLE NISLANDS_SMC_STATETABLE; From 1ddeedaa28e14c4e40c95e3d8026d69eef47eaba Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 9 May 2021 17:55:25 -0500 Subject: [PATCH 219/263] drm/radeon/si_dpm: Fix SMU power state load Create new structure SISLANDS_SMC_SWSTATE_SINGLE, as initialState.levels and ACPIState.levels are never actually used as flexible arrays. Those arrays can be used as simple objects of type SISLANDS_SMC_HW_PERFORMANCE_LEVEL, instead. Currently, the code fails because flexible array _levels_ in struct SISLANDS_SMC_SWSTATE doesn't allow for code that access the first element of initialState.levels and ACPIState.levels arrays: 4353 table->initialState.levels[0].mclk.vDLL_CNTL = 4354 cpu_to_be32(si_pi->clock_registers.dll_cntl); ... 4555 table->ACPIState.levels[0].mclk.vDLL_CNTL = 4556 cpu_to_be32(dll_cntl); because such element cannot exist without previously allocating any dynamic memory for it (which never actually happens). That's why struct SISLANDS_SMC_SWSTATE should only be used as type for object driverState and new struct SISLANDS_SMC_SWSTATE_SINGLE is created as type for objects initialState, ACPIState and ULVState. Also, with the change from one-element array to flexible-array member in commit 96e27e8d919e ("drm/radeon/si_dpm: Replace one-element array with flexible-array in struct SISLANDS_SMC_SWSTATE"), the size of dpmLevels in struct SISLANDS_SMC_STATETABLE should be fixed to be SISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE instead of SISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE - 1. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1583 Fixes: 96e27e8d919e ("drm/radeon/si_dpm: Replace one-element array with flexible-array in struct SISLANDS_SMC_SWSTATE") Cc: stable@vger.kernel.org Reported-by: Kai-Heng Feng Tested-by: Kai-Heng Feng Signed-off-by: Gustavo A. R. Silva Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/si_dpm.c | 174 +++++++++++++------------- drivers/gpu/drm/radeon/sislands_smc.h | 34 +++-- 2 files changed, 109 insertions(+), 99 deletions(-) diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index 9186095518047..2c54c0d7ca5be 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -4350,70 +4350,70 @@ static int si_populate_smc_initial_state(struct radeon_device *rdev, u32 reg; int ret; - table->initialState.levels[0].mclk.vDLL_CNTL = + table->initialState.level.mclk.vDLL_CNTL = cpu_to_be32(si_pi->clock_registers.dll_cntl); - table->initialState.levels[0].mclk.vMCLK_PWRMGT_CNTL = + table->initialState.level.mclk.vMCLK_PWRMGT_CNTL = cpu_to_be32(si_pi->clock_registers.mclk_pwrmgt_cntl); - table->initialState.levels[0].mclk.vMPLL_AD_FUNC_CNTL = + table->initialState.level.mclk.vMPLL_AD_FUNC_CNTL = cpu_to_be32(si_pi->clock_registers.mpll_ad_func_cntl); - table->initialState.levels[0].mclk.vMPLL_DQ_FUNC_CNTL = + table->initialState.level.mclk.vMPLL_DQ_FUNC_CNTL = cpu_to_be32(si_pi->clock_registers.mpll_dq_func_cntl); - table->initialState.levels[0].mclk.vMPLL_FUNC_CNTL = + table->initialState.level.mclk.vMPLL_FUNC_CNTL = cpu_to_be32(si_pi->clock_registers.mpll_func_cntl); - table->initialState.levels[0].mclk.vMPLL_FUNC_CNTL_1 = + table->initialState.level.mclk.vMPLL_FUNC_CNTL_1 = cpu_to_be32(si_pi->clock_registers.mpll_func_cntl_1); - table->initialState.levels[0].mclk.vMPLL_FUNC_CNTL_2 = + table->initialState.level.mclk.vMPLL_FUNC_CNTL_2 = cpu_to_be32(si_pi->clock_registers.mpll_func_cntl_2); - table->initialState.levels[0].mclk.vMPLL_SS = + table->initialState.level.mclk.vMPLL_SS = cpu_to_be32(si_pi->clock_registers.mpll_ss1); - table->initialState.levels[0].mclk.vMPLL_SS2 = + table->initialState.level.mclk.vMPLL_SS2 = cpu_to_be32(si_pi->clock_registers.mpll_ss2); - table->initialState.levels[0].mclk.mclk_value = + table->initialState.level.mclk.mclk_value = cpu_to_be32(initial_state->performance_levels[0].mclk); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL = cpu_to_be32(si_pi->clock_registers.cg_spll_func_cntl); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_2 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_2 = cpu_to_be32(si_pi->clock_registers.cg_spll_func_cntl_2); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_3 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_3 = cpu_to_be32(si_pi->clock_registers.cg_spll_func_cntl_3); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_4 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_4 = cpu_to_be32(si_pi->clock_registers.cg_spll_func_cntl_4); - table->initialState.levels[0].sclk.vCG_SPLL_SPREAD_SPECTRUM = + table->initialState.level.sclk.vCG_SPLL_SPREAD_SPECTRUM = cpu_to_be32(si_pi->clock_registers.cg_spll_spread_spectrum); - table->initialState.levels[0].sclk.vCG_SPLL_SPREAD_SPECTRUM_2 = + table->initialState.level.sclk.vCG_SPLL_SPREAD_SPECTRUM_2 = cpu_to_be32(si_pi->clock_registers.cg_spll_spread_spectrum_2); - table->initialState.levels[0].sclk.sclk_value = + table->initialState.level.sclk.sclk_value = cpu_to_be32(initial_state->performance_levels[0].sclk); - table->initialState.levels[0].arbRefreshState = + table->initialState.level.arbRefreshState = SISLANDS_INITIAL_STATE_ARB_INDEX; - table->initialState.levels[0].ACIndex = 0; + table->initialState.level.ACIndex = 0; ret = si_populate_voltage_value(rdev, &eg_pi->vddc_voltage_table, initial_state->performance_levels[0].vddc, - &table->initialState.levels[0].vddc); + &table->initialState.level.vddc); if (!ret) { u16 std_vddc; ret = si_get_std_voltage_value(rdev, - &table->initialState.levels[0].vddc, + &table->initialState.level.vddc, &std_vddc); if (!ret) si_populate_std_voltage_value(rdev, std_vddc, - table->initialState.levels[0].vddc.index, - &table->initialState.levels[0].std_vddc); + table->initialState.level.vddc.index, + &table->initialState.level.std_vddc); } if (eg_pi->vddci_control) si_populate_voltage_value(rdev, &eg_pi->vddci_voltage_table, initial_state->performance_levels[0].vddci, - &table->initialState.levels[0].vddci); + &table->initialState.level.vddci); if (si_pi->vddc_phase_shed_control) si_populate_phase_shedding_value(rdev, @@ -4421,43 +4421,43 @@ static int si_populate_smc_initial_state(struct radeon_device *rdev, initial_state->performance_levels[0].vddc, initial_state->performance_levels[0].sclk, initial_state->performance_levels[0].mclk, - &table->initialState.levels[0].vddc); + &table->initialState.level.vddc); - si_populate_initial_mvdd_value(rdev, &table->initialState.levels[0].mvdd); + si_populate_initial_mvdd_value(rdev, &table->initialState.level.mvdd); reg = CG_R(0xffff) | CG_L(0); - table->initialState.levels[0].aT = cpu_to_be32(reg); + table->initialState.level.aT = cpu_to_be32(reg); - table->initialState.levels[0].bSP = cpu_to_be32(pi->dsp); + table->initialState.level.bSP = cpu_to_be32(pi->dsp); - table->initialState.levels[0].gen2PCIE = (u8)si_pi->boot_pcie_gen; + table->initialState.level.gen2PCIE = (u8)si_pi->boot_pcie_gen; if (pi->mem_gddr5) { - table->initialState.levels[0].strobeMode = + table->initialState.level.strobeMode = si_get_strobe_mode_settings(rdev, initial_state->performance_levels[0].mclk); if (initial_state->performance_levels[0].mclk > pi->mclk_edc_enable_threshold) - table->initialState.levels[0].mcFlags = SISLANDS_SMC_MC_EDC_RD_FLAG | SISLANDS_SMC_MC_EDC_WR_FLAG; + table->initialState.level.mcFlags = SISLANDS_SMC_MC_EDC_RD_FLAG | SISLANDS_SMC_MC_EDC_WR_FLAG; else - table->initialState.levels[0].mcFlags = 0; + table->initialState.level.mcFlags = 0; } table->initialState.levelCount = 1; table->initialState.flags |= PPSMC_SWSTATE_FLAG_DC; - table->initialState.levels[0].dpm2.MaxPS = 0; - table->initialState.levels[0].dpm2.NearTDPDec = 0; - table->initialState.levels[0].dpm2.AboveSafeInc = 0; - table->initialState.levels[0].dpm2.BelowSafeInc = 0; - table->initialState.levels[0].dpm2.PwrEfficiencyRatio = 0; + table->initialState.level.dpm2.MaxPS = 0; + table->initialState.level.dpm2.NearTDPDec = 0; + table->initialState.level.dpm2.AboveSafeInc = 0; + table->initialState.level.dpm2.BelowSafeInc = 0; + table->initialState.level.dpm2.PwrEfficiencyRatio = 0; reg = MIN_POWER_MASK | MAX_POWER_MASK; - table->initialState.levels[0].SQPowerThrottle = cpu_to_be32(reg); + table->initialState.level.SQPowerThrottle = cpu_to_be32(reg); reg = MAX_POWER_DELTA_MASK | STI_SIZE_MASK | LTI_RATIO_MASK; - table->initialState.levels[0].SQPowerThrottle_2 = cpu_to_be32(reg); + table->initialState.level.SQPowerThrottle_2 = cpu_to_be32(reg); return 0; } @@ -4488,18 +4488,18 @@ static int si_populate_smc_acpi_state(struct radeon_device *rdev, if (pi->acpi_vddc) { ret = si_populate_voltage_value(rdev, &eg_pi->vddc_voltage_table, - pi->acpi_vddc, &table->ACPIState.levels[0].vddc); + pi->acpi_vddc, &table->ACPIState.level.vddc); if (!ret) { u16 std_vddc; ret = si_get_std_voltage_value(rdev, - &table->ACPIState.levels[0].vddc, &std_vddc); + &table->ACPIState.level.vddc, &std_vddc); if (!ret) si_populate_std_voltage_value(rdev, std_vddc, - table->ACPIState.levels[0].vddc.index, - &table->ACPIState.levels[0].std_vddc); + table->ACPIState.level.vddc.index, + &table->ACPIState.level.std_vddc); } - table->ACPIState.levels[0].gen2PCIE = si_pi->acpi_pcie_gen; + table->ACPIState.level.gen2PCIE = si_pi->acpi_pcie_gen; if (si_pi->vddc_phase_shed_control) { si_populate_phase_shedding_value(rdev, @@ -4507,23 +4507,23 @@ static int si_populate_smc_acpi_state(struct radeon_device *rdev, pi->acpi_vddc, 0, 0, - &table->ACPIState.levels[0].vddc); + &table->ACPIState.level.vddc); } } else { ret = si_populate_voltage_value(rdev, &eg_pi->vddc_voltage_table, - pi->min_vddc_in_table, &table->ACPIState.levels[0].vddc); + pi->min_vddc_in_table, &table->ACPIState.level.vddc); if (!ret) { u16 std_vddc; ret = si_get_std_voltage_value(rdev, - &table->ACPIState.levels[0].vddc, &std_vddc); + &table->ACPIState.level.vddc, &std_vddc); if (!ret) si_populate_std_voltage_value(rdev, std_vddc, - table->ACPIState.levels[0].vddc.index, - &table->ACPIState.levels[0].std_vddc); + table->ACPIState.level.vddc.index, + &table->ACPIState.level.std_vddc); } - table->ACPIState.levels[0].gen2PCIE = (u8)r600_get_pcie_gen_support(rdev, + table->ACPIState.level.gen2PCIE = (u8)r600_get_pcie_gen_support(rdev, si_pi->sys_pcie_mask, si_pi->boot_pcie_gen, RADEON_PCIE_GEN1); @@ -4534,14 +4534,14 @@ static int si_populate_smc_acpi_state(struct radeon_device *rdev, pi->min_vddc_in_table, 0, 0, - &table->ACPIState.levels[0].vddc); + &table->ACPIState.level.vddc); } if (pi->acpi_vddc) { if (eg_pi->acpi_vddci) si_populate_voltage_value(rdev, &eg_pi->vddci_voltage_table, eg_pi->acpi_vddci, - &table->ACPIState.levels[0].vddci); + &table->ACPIState.level.vddci); } mclk_pwrmgt_cntl |= MRDCK0_RESET | MRDCK1_RESET; @@ -4552,59 +4552,59 @@ static int si_populate_smc_acpi_state(struct radeon_device *rdev, spll_func_cntl_2 &= ~SCLK_MUX_SEL_MASK; spll_func_cntl_2 |= SCLK_MUX_SEL(4); - table->ACPIState.levels[0].mclk.vDLL_CNTL = + table->ACPIState.level.mclk.vDLL_CNTL = cpu_to_be32(dll_cntl); - table->ACPIState.levels[0].mclk.vMCLK_PWRMGT_CNTL = + table->ACPIState.level.mclk.vMCLK_PWRMGT_CNTL = cpu_to_be32(mclk_pwrmgt_cntl); - table->ACPIState.levels[0].mclk.vMPLL_AD_FUNC_CNTL = + table->ACPIState.level.mclk.vMPLL_AD_FUNC_CNTL = cpu_to_be32(mpll_ad_func_cntl); - table->ACPIState.levels[0].mclk.vMPLL_DQ_FUNC_CNTL = + table->ACPIState.level.mclk.vMPLL_DQ_FUNC_CNTL = cpu_to_be32(mpll_dq_func_cntl); - table->ACPIState.levels[0].mclk.vMPLL_FUNC_CNTL = + table->ACPIState.level.mclk.vMPLL_FUNC_CNTL = cpu_to_be32(mpll_func_cntl); - table->ACPIState.levels[0].mclk.vMPLL_FUNC_CNTL_1 = + table->ACPIState.level.mclk.vMPLL_FUNC_CNTL_1 = cpu_to_be32(mpll_func_cntl_1); - table->ACPIState.levels[0].mclk.vMPLL_FUNC_CNTL_2 = + table->ACPIState.level.mclk.vMPLL_FUNC_CNTL_2 = cpu_to_be32(mpll_func_cntl_2); - table->ACPIState.levels[0].mclk.vMPLL_SS = + table->ACPIState.level.mclk.vMPLL_SS = cpu_to_be32(si_pi->clock_registers.mpll_ss1); - table->ACPIState.levels[0].mclk.vMPLL_SS2 = + table->ACPIState.level.mclk.vMPLL_SS2 = cpu_to_be32(si_pi->clock_registers.mpll_ss2); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL = + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL = cpu_to_be32(spll_func_cntl); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_2 = + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_2 = cpu_to_be32(spll_func_cntl_2); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_3 = + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_3 = cpu_to_be32(spll_func_cntl_3); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_4 = + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_4 = cpu_to_be32(spll_func_cntl_4); - table->ACPIState.levels[0].mclk.mclk_value = 0; - table->ACPIState.levels[0].sclk.sclk_value = 0; + table->ACPIState.level.mclk.mclk_value = 0; + table->ACPIState.level.sclk.sclk_value = 0; - si_populate_mvdd_value(rdev, 0, &table->ACPIState.levels[0].mvdd); + si_populate_mvdd_value(rdev, 0, &table->ACPIState.level.mvdd); if (eg_pi->dynamic_ac_timing) - table->ACPIState.levels[0].ACIndex = 0; + table->ACPIState.level.ACIndex = 0; - table->ACPIState.levels[0].dpm2.MaxPS = 0; - table->ACPIState.levels[0].dpm2.NearTDPDec = 0; - table->ACPIState.levels[0].dpm2.AboveSafeInc = 0; - table->ACPIState.levels[0].dpm2.BelowSafeInc = 0; - table->ACPIState.levels[0].dpm2.PwrEfficiencyRatio = 0; + table->ACPIState.level.dpm2.MaxPS = 0; + table->ACPIState.level.dpm2.NearTDPDec = 0; + table->ACPIState.level.dpm2.AboveSafeInc = 0; + table->ACPIState.level.dpm2.BelowSafeInc = 0; + table->ACPIState.level.dpm2.PwrEfficiencyRatio = 0; reg = MIN_POWER_MASK | MAX_POWER_MASK; - table->ACPIState.levels[0].SQPowerThrottle = cpu_to_be32(reg); + table->ACPIState.level.SQPowerThrottle = cpu_to_be32(reg); reg = MAX_POWER_DELTA_MASK | STI_SIZE_MASK | LTI_RATIO_MASK; - table->ACPIState.levels[0].SQPowerThrottle_2 = cpu_to_be32(reg); + table->ACPIState.level.SQPowerThrottle_2 = cpu_to_be32(reg); return 0; } static int si_populate_ulv_state(struct radeon_device *rdev, - SISLANDS_SMC_SWSTATE *state) + struct SISLANDS_SMC_SWSTATE_SINGLE *state) { struct evergreen_power_info *eg_pi = evergreen_get_pi(rdev); struct si_power_info *si_pi = si_get_pi(rdev); @@ -4613,19 +4613,19 @@ static int si_populate_ulv_state(struct radeon_device *rdev, int ret; ret = si_convert_power_level_to_smc(rdev, &ulv->pl, - &state->levels[0]); + &state->level); if (!ret) { if (eg_pi->sclk_deep_sleep) { if (sclk_in_sr <= SCLK_MIN_DEEPSLEEP_FREQ) - state->levels[0].stateFlags |= PPSMC_STATEFLAG_DEEPSLEEP_BYPASS; + state->level.stateFlags |= PPSMC_STATEFLAG_DEEPSLEEP_BYPASS; else - state->levels[0].stateFlags |= PPSMC_STATEFLAG_DEEPSLEEP_THROTTLE; + state->level.stateFlags |= PPSMC_STATEFLAG_DEEPSLEEP_THROTTLE; } if (ulv->one_pcie_lane_in_ulv) state->flags |= PPSMC_SWSTATE_FLAG_PCIE_X1; - state->levels[0].arbRefreshState = (u8)(SISLANDS_ULV_STATE_ARB_INDEX); - state->levels[0].ACIndex = 1; - state->levels[0].std_vddc = state->levels[0].vddc; + state->level.arbRefreshState = (u8)(SISLANDS_ULV_STATE_ARB_INDEX); + state->level.ACIndex = 1; + state->level.std_vddc = state->level.vddc; state->levelCount = 1; state->flags |= PPSMC_SWSTATE_FLAG_DC; @@ -4725,7 +4725,9 @@ static int si_init_smc_table(struct radeon_device *rdev) if (ret) return ret; - table->driverState = table->initialState; + table->driverState.flags = table->initialState.flags; + table->driverState.levelCount = table->initialState.levelCount; + table->driverState.levels[0] = table->initialState.level; ret = si_do_program_memory_timing_parameters(rdev, radeon_boot_state, SISLANDS_INITIAL_STATE_ARB_INDEX); @@ -5275,8 +5277,8 @@ static int si_upload_ulv_state(struct radeon_device *rdev) if (ulv->supported && ulv->pl.vddc) { u32 address = si_pi->state_table_start + offsetof(SISLANDS_SMC_STATETABLE, ULVState); - SISLANDS_SMC_SWSTATE *smc_state = &si_pi->smc_statetable.ULVState; - u32 state_size = sizeof(SISLANDS_SMC_SWSTATE); + struct SISLANDS_SMC_SWSTATE_SINGLE *smc_state = &si_pi->smc_statetable.ULVState; + u32 state_size = sizeof(struct SISLANDS_SMC_SWSTATE_SINGLE); memset(smc_state, 0, state_size); diff --git a/drivers/gpu/drm/radeon/sislands_smc.h b/drivers/gpu/drm/radeon/sislands_smc.h index fbd6589bdab92..4ea1cb2e45a3c 100644 --- a/drivers/gpu/drm/radeon/sislands_smc.h +++ b/drivers/gpu/drm/radeon/sislands_smc.h @@ -191,6 +191,14 @@ struct SISLANDS_SMC_SWSTATE typedef struct SISLANDS_SMC_SWSTATE SISLANDS_SMC_SWSTATE; +struct SISLANDS_SMC_SWSTATE_SINGLE { + uint8_t flags; + uint8_t levelCount; + uint8_t padding2; + uint8_t padding3; + SISLANDS_SMC_HW_PERFORMANCE_LEVEL level; +}; + #define SISLANDS_SMC_VOLTAGEMASK_VDDC 0 #define SISLANDS_SMC_VOLTAGEMASK_MVDD 1 #define SISLANDS_SMC_VOLTAGEMASK_VDDCI 2 @@ -208,19 +216,19 @@ typedef struct SISLANDS_SMC_VOLTAGEMASKTABLE SISLANDS_SMC_VOLTAGEMASKTABLE; struct SISLANDS_SMC_STATETABLE { - uint8_t thermalProtectType; - uint8_t systemFlags; - uint8_t maxVDDCIndexInPPTable; - uint8_t extraFlags; - uint32_t lowSMIO[SISLANDS_MAX_NO_VREG_STEPS]; - SISLANDS_SMC_VOLTAGEMASKTABLE voltageMaskTable; - SISLANDS_SMC_VOLTAGEMASKTABLE phaseMaskTable; - PP_SIslands_DPM2Parameters dpm2Params; - SISLANDS_SMC_SWSTATE initialState; - SISLANDS_SMC_SWSTATE ACPIState; - SISLANDS_SMC_SWSTATE ULVState; - SISLANDS_SMC_SWSTATE driverState; - SISLANDS_SMC_HW_PERFORMANCE_LEVEL dpmLevels[SISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE - 1]; + uint8_t thermalProtectType; + uint8_t systemFlags; + uint8_t maxVDDCIndexInPPTable; + uint8_t extraFlags; + uint32_t lowSMIO[SISLANDS_MAX_NO_VREG_STEPS]; + SISLANDS_SMC_VOLTAGEMASKTABLE voltageMaskTable; + SISLANDS_SMC_VOLTAGEMASKTABLE phaseMaskTable; + PP_SIslands_DPM2Parameters dpm2Params; + struct SISLANDS_SMC_SWSTATE_SINGLE initialState; + struct SISLANDS_SMC_SWSTATE_SINGLE ACPIState; + struct SISLANDS_SMC_SWSTATE_SINGLE ULVState; + SISLANDS_SMC_SWSTATE driverState; + SISLANDS_SMC_HW_PERFORMANCE_LEVEL dpmLevels[SISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE]; }; typedef struct SISLANDS_SMC_STATETABLE SISLANDS_SMC_STATETABLE; From 939baec9e895e75149327c01b775f46c21e12be5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 10 May 2021 15:46:18 -0500 Subject: [PATCH 220/263] drm/amd/pm: Fix out-of-bounds bug Create new structure SISLANDS_SMC_SWSTATE_SINGLE, as initialState.levels and ACPIState.levels are never actually used as flexible arrays. Those arrays can be used as simple objects of type SISLANDS_SMC_HW_PERFORMANCE_LEVEL, instead. Currently, the code fails because flexible array _levels_ in struct SISLANDS_SMC_SWSTATE doesn't allow for code that accesses the first element of initialState.levels and ACPIState.levels arrays: drivers/gpu/drm/amd/pm/powerplay/si_dpm.c: 4820: table->initialState.levels[0].mclk.vDLL_CNTL = 4821: cpu_to_be32(si_pi->clock_registers.dll_cntl); ... 5021: table->ACPIState.levels[0].mclk.vDLL_CNTL = 5022: cpu_to_be32(dll_cntl); because such element cannot be accessed without previously allocating enough dynamic memory for it to exist (which never actually happens). So, there is an out-of-bounds bug in this case. That's why struct SISLANDS_SMC_SWSTATE should only be used as type for object driverState and new struct SISLANDS_SMC_SWSTATE_SINGLE is created as type for objects initialState, ACPIState and ULVState. Also, with the change from one-element array to flexible-array member in commit 0e1aa13ca3ff ("drm/amd/pm: Replace one-element array with flexible-array in struct SISLANDS_SMC_SWSTATE"), the size of dpmLevels in struct SISLANDS_SMC_STATETABLE should be fixed to be SISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE instead of SISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE - 1. Fixes: 0e1aa13ca3ff ("drm/amd/pm: Replace one-element array with flexible-array in struct SISLANDS_SMC_SWSTATE") Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. Silva Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/powerplay/si_dpm.c | 174 +++++++++--------- .../gpu/drm/amd/pm/powerplay/sislands_smc.h | 34 ++-- 2 files changed, 109 insertions(+), 99 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/si_dpm.c b/drivers/gpu/drm/amd/pm/powerplay/si_dpm.c index 26a5321e621bf..15c0b8af376f8 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/si_dpm.c +++ b/drivers/gpu/drm/amd/pm/powerplay/si_dpm.c @@ -4817,70 +4817,70 @@ static int si_populate_smc_initial_state(struct amdgpu_device *adev, u32 reg; int ret; - table->initialState.levels[0].mclk.vDLL_CNTL = + table->initialState.level.mclk.vDLL_CNTL = cpu_to_be32(si_pi->clock_registers.dll_cntl); - table->initialState.levels[0].mclk.vMCLK_PWRMGT_CNTL = + table->initialState.level.mclk.vMCLK_PWRMGT_CNTL = cpu_to_be32(si_pi->clock_registers.mclk_pwrmgt_cntl); - table->initialState.levels[0].mclk.vMPLL_AD_FUNC_CNTL = + table->initialState.level.mclk.vMPLL_AD_FUNC_CNTL = cpu_to_be32(si_pi->clock_registers.mpll_ad_func_cntl); - table->initialState.levels[0].mclk.vMPLL_DQ_FUNC_CNTL = + table->initialState.level.mclk.vMPLL_DQ_FUNC_CNTL = cpu_to_be32(si_pi->clock_registers.mpll_dq_func_cntl); - table->initialState.levels[0].mclk.vMPLL_FUNC_CNTL = + table->initialState.level.mclk.vMPLL_FUNC_CNTL = cpu_to_be32(si_pi->clock_registers.mpll_func_cntl); - table->initialState.levels[0].mclk.vMPLL_FUNC_CNTL_1 = + table->initialState.level.mclk.vMPLL_FUNC_CNTL_1 = cpu_to_be32(si_pi->clock_registers.mpll_func_cntl_1); - table->initialState.levels[0].mclk.vMPLL_FUNC_CNTL_2 = + table->initialState.level.mclk.vMPLL_FUNC_CNTL_2 = cpu_to_be32(si_pi->clock_registers.mpll_func_cntl_2); - table->initialState.levels[0].mclk.vMPLL_SS = + table->initialState.level.mclk.vMPLL_SS = cpu_to_be32(si_pi->clock_registers.mpll_ss1); - table->initialState.levels[0].mclk.vMPLL_SS2 = + table->initialState.level.mclk.vMPLL_SS2 = cpu_to_be32(si_pi->clock_registers.mpll_ss2); - table->initialState.levels[0].mclk.mclk_value = + table->initialState.level.mclk.mclk_value = cpu_to_be32(initial_state->performance_levels[0].mclk); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL = cpu_to_be32(si_pi->clock_registers.cg_spll_func_cntl); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_2 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_2 = cpu_to_be32(si_pi->clock_registers.cg_spll_func_cntl_2); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_3 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_3 = cpu_to_be32(si_pi->clock_registers.cg_spll_func_cntl_3); - table->initialState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_4 = + table->initialState.level.sclk.vCG_SPLL_FUNC_CNTL_4 = cpu_to_be32(si_pi->clock_registers.cg_spll_func_cntl_4); - table->initialState.levels[0].sclk.vCG_SPLL_SPREAD_SPECTRUM = + table->initialState.level.sclk.vCG_SPLL_SPREAD_SPECTRUM = cpu_to_be32(si_pi->clock_registers.cg_spll_spread_spectrum); - table->initialState.levels[0].sclk.vCG_SPLL_SPREAD_SPECTRUM_2 = + table->initialState.level.sclk.vCG_SPLL_SPREAD_SPECTRUM_2 = cpu_to_be32(si_pi->clock_registers.cg_spll_spread_spectrum_2); - table->initialState.levels[0].sclk.sclk_value = + table->initialState.level.sclk.sclk_value = cpu_to_be32(initial_state->performance_levels[0].sclk); - table->initialState.levels[0].arbRefreshState = + table->initialState.level.arbRefreshState = SISLANDS_INITIAL_STATE_ARB_INDEX; - table->initialState.levels[0].ACIndex = 0; + table->initialState.level.ACIndex = 0; ret = si_populate_voltage_value(adev, &eg_pi->vddc_voltage_table, initial_state->performance_levels[0].vddc, - &table->initialState.levels[0].vddc); + &table->initialState.level.vddc); if (!ret) { u16 std_vddc; ret = si_get_std_voltage_value(adev, - &table->initialState.levels[0].vddc, + &table->initialState.level.vddc, &std_vddc); if (!ret) si_populate_std_voltage_value(adev, std_vddc, - table->initialState.levels[0].vddc.index, - &table->initialState.levels[0].std_vddc); + table->initialState.level.vddc.index, + &table->initialState.level.std_vddc); } if (eg_pi->vddci_control) si_populate_voltage_value(adev, &eg_pi->vddci_voltage_table, initial_state->performance_levels[0].vddci, - &table->initialState.levels[0].vddci); + &table->initialState.level.vddci); if (si_pi->vddc_phase_shed_control) si_populate_phase_shedding_value(adev, @@ -4888,41 +4888,41 @@ static int si_populate_smc_initial_state(struct amdgpu_device *adev, initial_state->performance_levels[0].vddc, initial_state->performance_levels[0].sclk, initial_state->performance_levels[0].mclk, - &table->initialState.levels[0].vddc); + &table->initialState.level.vddc); - si_populate_initial_mvdd_value(adev, &table->initialState.levels[0].mvdd); + si_populate_initial_mvdd_value(adev, &table->initialState.level.mvdd); reg = CG_R(0xffff) | CG_L(0); - table->initialState.levels[0].aT = cpu_to_be32(reg); - table->initialState.levels[0].bSP = cpu_to_be32(pi->dsp); - table->initialState.levels[0].gen2PCIE = (u8)si_pi->boot_pcie_gen; + table->initialState.level.aT = cpu_to_be32(reg); + table->initialState.level.bSP = cpu_to_be32(pi->dsp); + table->initialState.level.gen2PCIE = (u8)si_pi->boot_pcie_gen; if (adev->gmc.vram_type == AMDGPU_VRAM_TYPE_GDDR5) { - table->initialState.levels[0].strobeMode = + table->initialState.level.strobeMode = si_get_strobe_mode_settings(adev, initial_state->performance_levels[0].mclk); if (initial_state->performance_levels[0].mclk > pi->mclk_edc_enable_threshold) - table->initialState.levels[0].mcFlags = SISLANDS_SMC_MC_EDC_RD_FLAG | SISLANDS_SMC_MC_EDC_WR_FLAG; + table->initialState.level.mcFlags = SISLANDS_SMC_MC_EDC_RD_FLAG | SISLANDS_SMC_MC_EDC_WR_FLAG; else - table->initialState.levels[0].mcFlags = 0; + table->initialState.level.mcFlags = 0; } table->initialState.levelCount = 1; table->initialState.flags |= PPSMC_SWSTATE_FLAG_DC; - table->initialState.levels[0].dpm2.MaxPS = 0; - table->initialState.levels[0].dpm2.NearTDPDec = 0; - table->initialState.levels[0].dpm2.AboveSafeInc = 0; - table->initialState.levels[0].dpm2.BelowSafeInc = 0; - table->initialState.levels[0].dpm2.PwrEfficiencyRatio = 0; + table->initialState.level.dpm2.MaxPS = 0; + table->initialState.level.dpm2.NearTDPDec = 0; + table->initialState.level.dpm2.AboveSafeInc = 0; + table->initialState.level.dpm2.BelowSafeInc = 0; + table->initialState.level.dpm2.PwrEfficiencyRatio = 0; reg = MIN_POWER_MASK | MAX_POWER_MASK; - table->initialState.levels[0].SQPowerThrottle = cpu_to_be32(reg); + table->initialState.level.SQPowerThrottle = cpu_to_be32(reg); reg = MAX_POWER_DELTA_MASK | STI_SIZE_MASK | LTI_RATIO_MASK; - table->initialState.levels[0].SQPowerThrottle_2 = cpu_to_be32(reg); + table->initialState.level.SQPowerThrottle_2 = cpu_to_be32(reg); return 0; } @@ -4953,18 +4953,18 @@ static int si_populate_smc_acpi_state(struct amdgpu_device *adev, if (pi->acpi_vddc) { ret = si_populate_voltage_value(adev, &eg_pi->vddc_voltage_table, - pi->acpi_vddc, &table->ACPIState.levels[0].vddc); + pi->acpi_vddc, &table->ACPIState.level.vddc); if (!ret) { u16 std_vddc; ret = si_get_std_voltage_value(adev, - &table->ACPIState.levels[0].vddc, &std_vddc); + &table->ACPIState.level.vddc, &std_vddc); if (!ret) si_populate_std_voltage_value(adev, std_vddc, - table->ACPIState.levels[0].vddc.index, - &table->ACPIState.levels[0].std_vddc); + table->ACPIState.level.vddc.index, + &table->ACPIState.level.std_vddc); } - table->ACPIState.levels[0].gen2PCIE = si_pi->acpi_pcie_gen; + table->ACPIState.level.gen2PCIE = si_pi->acpi_pcie_gen; if (si_pi->vddc_phase_shed_control) { si_populate_phase_shedding_value(adev, @@ -4972,23 +4972,23 @@ static int si_populate_smc_acpi_state(struct amdgpu_device *adev, pi->acpi_vddc, 0, 0, - &table->ACPIState.levels[0].vddc); + &table->ACPIState.level.vddc); } } else { ret = si_populate_voltage_value(adev, &eg_pi->vddc_voltage_table, - pi->min_vddc_in_table, &table->ACPIState.levels[0].vddc); + pi->min_vddc_in_table, &table->ACPIState.level.vddc); if (!ret) { u16 std_vddc; ret = si_get_std_voltage_value(adev, - &table->ACPIState.levels[0].vddc, &std_vddc); + &table->ACPIState.level.vddc, &std_vddc); if (!ret) si_populate_std_voltage_value(adev, std_vddc, - table->ACPIState.levels[0].vddc.index, - &table->ACPIState.levels[0].std_vddc); + table->ACPIState.level.vddc.index, + &table->ACPIState.level.std_vddc); } - table->ACPIState.levels[0].gen2PCIE = + table->ACPIState.level.gen2PCIE = (u8)amdgpu_get_pcie_gen_support(adev, si_pi->sys_pcie_mask, si_pi->boot_pcie_gen, @@ -5000,14 +5000,14 @@ static int si_populate_smc_acpi_state(struct amdgpu_device *adev, pi->min_vddc_in_table, 0, 0, - &table->ACPIState.levels[0].vddc); + &table->ACPIState.level.vddc); } if (pi->acpi_vddc) { if (eg_pi->acpi_vddci) si_populate_voltage_value(adev, &eg_pi->vddci_voltage_table, eg_pi->acpi_vddci, - &table->ACPIState.levels[0].vddci); + &table->ACPIState.level.vddci); } mclk_pwrmgt_cntl |= MRDCK0_RESET | MRDCK1_RESET; @@ -5018,59 +5018,59 @@ static int si_populate_smc_acpi_state(struct amdgpu_device *adev, spll_func_cntl_2 &= ~SCLK_MUX_SEL_MASK; spll_func_cntl_2 |= SCLK_MUX_SEL(4); - table->ACPIState.levels[0].mclk.vDLL_CNTL = + table->ACPIState.level.mclk.vDLL_CNTL = cpu_to_be32(dll_cntl); - table->ACPIState.levels[0].mclk.vMCLK_PWRMGT_CNTL = + table->ACPIState.level.mclk.vMCLK_PWRMGT_CNTL = cpu_to_be32(mclk_pwrmgt_cntl); - table->ACPIState.levels[0].mclk.vMPLL_AD_FUNC_CNTL = + table->ACPIState.level.mclk.vMPLL_AD_FUNC_CNTL = cpu_to_be32(mpll_ad_func_cntl); - table->ACPIState.levels[0].mclk.vMPLL_DQ_FUNC_CNTL = + table->ACPIState.level.mclk.vMPLL_DQ_FUNC_CNTL = cpu_to_be32(mpll_dq_func_cntl); - table->ACPIState.levels[0].mclk.vMPLL_FUNC_CNTL = + table->ACPIState.level.mclk.vMPLL_FUNC_CNTL = cpu_to_be32(mpll_func_cntl); - table->ACPIState.levels[0].mclk.vMPLL_FUNC_CNTL_1 = + table->ACPIState.level.mclk.vMPLL_FUNC_CNTL_1 = cpu_to_be32(mpll_func_cntl_1); - table->ACPIState.levels[0].mclk.vMPLL_FUNC_CNTL_2 = + table->ACPIState.level.mclk.vMPLL_FUNC_CNTL_2 = cpu_to_be32(mpll_func_cntl_2); - table->ACPIState.levels[0].mclk.vMPLL_SS = + table->ACPIState.level.mclk.vMPLL_SS = cpu_to_be32(si_pi->clock_registers.mpll_ss1); - table->ACPIState.levels[0].mclk.vMPLL_SS2 = + table->ACPIState.level.mclk.vMPLL_SS2 = cpu_to_be32(si_pi->clock_registers.mpll_ss2); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL = + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL = cpu_to_be32(spll_func_cntl); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_2 = + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_2 = cpu_to_be32(spll_func_cntl_2); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_3 = + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_3 = cpu_to_be32(spll_func_cntl_3); - table->ACPIState.levels[0].sclk.vCG_SPLL_FUNC_CNTL_4 = + table->ACPIState.level.sclk.vCG_SPLL_FUNC_CNTL_4 = cpu_to_be32(spll_func_cntl_4); - table->ACPIState.levels[0].mclk.mclk_value = 0; - table->ACPIState.levels[0].sclk.sclk_value = 0; + table->ACPIState.level.mclk.mclk_value = 0; + table->ACPIState.level.sclk.sclk_value = 0; - si_populate_mvdd_value(adev, 0, &table->ACPIState.levels[0].mvdd); + si_populate_mvdd_value(adev, 0, &table->ACPIState.level.mvdd); if (eg_pi->dynamic_ac_timing) - table->ACPIState.levels[0].ACIndex = 0; + table->ACPIState.level.ACIndex = 0; - table->ACPIState.levels[0].dpm2.MaxPS = 0; - table->ACPIState.levels[0].dpm2.NearTDPDec = 0; - table->ACPIState.levels[0].dpm2.AboveSafeInc = 0; - table->ACPIState.levels[0].dpm2.BelowSafeInc = 0; - table->ACPIState.levels[0].dpm2.PwrEfficiencyRatio = 0; + table->ACPIState.level.dpm2.MaxPS = 0; + table->ACPIState.level.dpm2.NearTDPDec = 0; + table->ACPIState.level.dpm2.AboveSafeInc = 0; + table->ACPIState.level.dpm2.BelowSafeInc = 0; + table->ACPIState.level.dpm2.PwrEfficiencyRatio = 0; reg = MIN_POWER_MASK | MAX_POWER_MASK; - table->ACPIState.levels[0].SQPowerThrottle = cpu_to_be32(reg); + table->ACPIState.level.SQPowerThrottle = cpu_to_be32(reg); reg = MAX_POWER_DELTA_MASK | STI_SIZE_MASK | LTI_RATIO_MASK; - table->ACPIState.levels[0].SQPowerThrottle_2 = cpu_to_be32(reg); + table->ACPIState.level.SQPowerThrottle_2 = cpu_to_be32(reg); return 0; } static int si_populate_ulv_state(struct amdgpu_device *adev, - SISLANDS_SMC_SWSTATE *state) + struct SISLANDS_SMC_SWSTATE_SINGLE *state) { struct evergreen_power_info *eg_pi = evergreen_get_pi(adev); struct si_power_info *si_pi = si_get_pi(adev); @@ -5079,19 +5079,19 @@ static int si_populate_ulv_state(struct amdgpu_device *adev, int ret; ret = si_convert_power_level_to_smc(adev, &ulv->pl, - &state->levels[0]); + &state->level); if (!ret) { if (eg_pi->sclk_deep_sleep) { if (sclk_in_sr <= SCLK_MIN_DEEPSLEEP_FREQ) - state->levels[0].stateFlags |= PPSMC_STATEFLAG_DEEPSLEEP_BYPASS; + state->level.stateFlags |= PPSMC_STATEFLAG_DEEPSLEEP_BYPASS; else - state->levels[0].stateFlags |= PPSMC_STATEFLAG_DEEPSLEEP_THROTTLE; + state->level.stateFlags |= PPSMC_STATEFLAG_DEEPSLEEP_THROTTLE; } if (ulv->one_pcie_lane_in_ulv) state->flags |= PPSMC_SWSTATE_FLAG_PCIE_X1; - state->levels[0].arbRefreshState = (u8)(SISLANDS_ULV_STATE_ARB_INDEX); - state->levels[0].ACIndex = 1; - state->levels[0].std_vddc = state->levels[0].vddc; + state->level.arbRefreshState = (u8)(SISLANDS_ULV_STATE_ARB_INDEX); + state->level.ACIndex = 1; + state->level.std_vddc = state->level.vddc; state->levelCount = 1; state->flags |= PPSMC_SWSTATE_FLAG_DC; @@ -5190,7 +5190,9 @@ static int si_init_smc_table(struct amdgpu_device *adev) if (ret) return ret; - table->driverState = table->initialState; + table->driverState.flags = table->initialState.flags; + table->driverState.levelCount = table->initialState.levelCount; + table->driverState.levels[0] = table->initialState.level; ret = si_do_program_memory_timing_parameters(adev, amdgpu_boot_state, SISLANDS_INITIAL_STATE_ARB_INDEX); @@ -5737,8 +5739,8 @@ static int si_upload_ulv_state(struct amdgpu_device *adev) if (ulv->supported && ulv->pl.vddc) { u32 address = si_pi->state_table_start + offsetof(SISLANDS_SMC_STATETABLE, ULVState); - SISLANDS_SMC_SWSTATE *smc_state = &si_pi->smc_statetable.ULVState; - u32 state_size = sizeof(SISLANDS_SMC_SWSTATE); + struct SISLANDS_SMC_SWSTATE_SINGLE *smc_state = &si_pi->smc_statetable.ULVState; + u32 state_size = sizeof(struct SISLANDS_SMC_SWSTATE_SINGLE); memset(smc_state, 0, state_size); diff --git a/drivers/gpu/drm/amd/pm/powerplay/sislands_smc.h b/drivers/gpu/drm/amd/pm/powerplay/sislands_smc.h index 0f7554052c906..c7dc117a688cb 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/sislands_smc.h +++ b/drivers/gpu/drm/amd/pm/powerplay/sislands_smc.h @@ -191,6 +191,14 @@ struct SISLANDS_SMC_SWSTATE typedef struct SISLANDS_SMC_SWSTATE SISLANDS_SMC_SWSTATE; +struct SISLANDS_SMC_SWSTATE_SINGLE { + uint8_t flags; + uint8_t levelCount; + uint8_t padding2; + uint8_t padding3; + SISLANDS_SMC_HW_PERFORMANCE_LEVEL level; +}; + #define SISLANDS_SMC_VOLTAGEMASK_VDDC 0 #define SISLANDS_SMC_VOLTAGEMASK_MVDD 1 #define SISLANDS_SMC_VOLTAGEMASK_VDDCI 2 @@ -208,19 +216,19 @@ typedef struct SISLANDS_SMC_VOLTAGEMASKTABLE SISLANDS_SMC_VOLTAGEMASKTABLE; struct SISLANDS_SMC_STATETABLE { - uint8_t thermalProtectType; - uint8_t systemFlags; - uint8_t maxVDDCIndexInPPTable; - uint8_t extraFlags; - uint32_t lowSMIO[SISLANDS_MAX_NO_VREG_STEPS]; - SISLANDS_SMC_VOLTAGEMASKTABLE voltageMaskTable; - SISLANDS_SMC_VOLTAGEMASKTABLE phaseMaskTable; - PP_SIslands_DPM2Parameters dpm2Params; - SISLANDS_SMC_SWSTATE initialState; - SISLANDS_SMC_SWSTATE ACPIState; - SISLANDS_SMC_SWSTATE ULVState; - SISLANDS_SMC_SWSTATE driverState; - SISLANDS_SMC_HW_PERFORMANCE_LEVEL dpmLevels[SISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE - 1]; + uint8_t thermalProtectType; + uint8_t systemFlags; + uint8_t maxVDDCIndexInPPTable; + uint8_t extraFlags; + uint32_t lowSMIO[SISLANDS_MAX_NO_VREG_STEPS]; + SISLANDS_SMC_VOLTAGEMASKTABLE voltageMaskTable; + SISLANDS_SMC_VOLTAGEMASKTABLE phaseMaskTable; + PP_SIslands_DPM2Parameters dpm2Params; + struct SISLANDS_SMC_SWSTATE_SINGLE initialState; + struct SISLANDS_SMC_SWSTATE_SINGLE ACPIState; + struct SISLANDS_SMC_SWSTATE_SINGLE ULVState; + SISLANDS_SMC_SWSTATE driverState; + SISLANDS_SMC_HW_PERFORMANCE_LEVEL dpmLevels[SISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE]; }; typedef struct SISLANDS_SMC_STATETABLE SISLANDS_SMC_STATETABLE; From fe1c97d008f86f672f0e9265f180c22451ca3b9f Mon Sep 17 00:00:00 2001 From: David Ward Date: Mon, 10 May 2021 05:30:39 -0400 Subject: [PATCH 221/263] drm/amd/display: Initialize attribute for hdcp_srm sysfs file It is stored in dynamically allocated memory, so sysfs_bin_attr_init() must be called to initialize it. (Note: "initialization" only sets the .attr.key member in this struct; it does not change the value of any other members.) Otherwise, when CONFIG_DEBUG_LOCK_ALLOC=y this message appears during boot: BUG: key ffff9248900cd148 has not been registered! Fixes: 9037246bb2da ("drm/amd/display: Add sysfs interface for set/get srm") Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1586 Reported-by: Mikhail Gavrilov Signed-off-by: David Ward Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c index 616f5b1ea3a88..666796a0067c3 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c @@ -650,6 +650,7 @@ struct hdcp_workqueue *hdcp_create_workqueue(struct amdgpu_device *adev, struct /* File created at /sys/class/drm/card0/device/hdcp_srm*/ hdcp_work[0].attr = data_attr; + sysfs_bin_attr_init(&hdcp_work[0].attr); if (sysfs_create_bin_file(&adev->dev->kobj, &hdcp_work[0].attr)) DRM_WARN("Failed to create device file hdcp_srm"); From 83a0b8639185f40ab7fc9dd291a057150eb9d238 Mon Sep 17 00:00:00 2001 From: Likun GAO Date: Thu, 29 Apr 2021 14:08:13 +0800 Subject: [PATCH 222/263] drm/amdgpu: add judgement when add ip blocks (v2) Judgement whether to add an sw ip according to the harvest info. v2: fix indentation (Alex) Signed-off-by: Likun Gao Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 28 +++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h | 1 + drivers/gpu/drm/amd/amdgpu/nv.c | 8 +++++- drivers/gpu/drm/amd/include/amd_shared.h | 6 ++++ 6 files changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dc3a69296321b..264176a01e16a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1006,6 +1006,7 @@ struct amdgpu_device { struct amdgpu_df df; struct amdgpu_ip_block ip_blocks[AMDGPU_MAX_IP_NUM]; + uint32_t harvest_ip_mask; int num_ip_blocks; struct mutex mn_lock; DECLARE_HASHTABLE(mn_hash, 7); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7d3b546151475..8b2a37bf2adf1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1683,6 +1683,19 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev, if (!ip_block_version) return -EINVAL; + switch (ip_block_version->type) { + case AMD_IP_BLOCK_TYPE_VCN: + if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) + return 0; + break; + case AMD_IP_BLOCK_TYPE_JPEG: + if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) + return 0; + break; + default: + break; + } + DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, ip_block_version->funcs->name); @@ -3111,7 +3124,6 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) return amdgpu_device_asic_has_dc_support(adev->asic_type); } - static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = @@ -3276,6 +3288,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->vm_manager.vm_pte_funcs = NULL; adev->vm_manager.vm_pte_num_scheds = 0; adev->gmc.gmc_funcs = NULL; + adev->harvest_ip_mask = 0x0; adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index b2dbcb4df0208..e1b6f58917599 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -373,6 +373,34 @@ int amdgpu_discovery_get_ip_version(struct amdgpu_device *adev, int hw_id, return -EINVAL; } +void amdgpu_discovery_harvest_ip(struct amdgpu_device *adev) +{ + struct binary_header *bhdr; + struct harvest_table *harvest_info; + int i; + + bhdr = (struct binary_header *)adev->mman.discovery_bin; + harvest_info = (struct harvest_table *)(adev->mman.discovery_bin + + le16_to_cpu(bhdr->table_list[HARVEST_INFO].offset)); + + for (i = 0; i < 32; i++) { + if (le32_to_cpu(harvest_info->list[i].hw_id) == 0) + break; + + switch (le32_to_cpu(harvest_info->list[i].hw_id)) { + case VCN_HWID: + adev->harvest_ip_mask |= AMD_HARVEST_IP_VCN_MASK; + adev->harvest_ip_mask |= AMD_HARVEST_IP_JPEG_MASK; + break; + case DMU_HWID: + adev->harvest_ip_mask |= AMD_HARVEST_IP_DMU_MASK; + break; + default: + break; + } + } +} + int amdgpu_discovery_get_gfx_info(struct amdgpu_device *adev) { struct binary_header *bhdr; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h index 8f6183801cb34..1b1ae21b10375 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h @@ -29,6 +29,7 @@ void amdgpu_discovery_fini(struct amdgpu_device *adev); int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev); +void amdgpu_discovery_harvest_ip(struct amdgpu_device *adev); int amdgpu_discovery_get_ip_version(struct amdgpu_device *adev, int hw_id, int *major, int *minor, int *revision); int amdgpu_discovery_get_gfx_info(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index d54af7f8801bf..428413c860c75 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -635,6 +635,8 @@ static int nv_reg_base_init(struct amdgpu_device *adev) goto legacy_init; } + amdgpu_discovery_harvest_ip(adev); + return 0; } @@ -777,7 +779,6 @@ int nv_set_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &vcn_v3_0_ip_block); if (!amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, &jpeg_v3_0_ip_block); - if (adev->enable_mes) amdgpu_device_ip_block_add(adev, &mes_v10_1_ip_block); break; @@ -1149,6 +1150,11 @@ static int nv_common_early_init(void *handle) return -EINVAL; } + if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) + adev->pg_flags &= ~(AMD_PG_SUPPORT_VCN | + AMD_PG_SUPPORT_VCN_DPG | + AMD_PG_SUPPORT_JPEG); + if (amdgpu_sriov_vf(adev)) { amdgpu_virt_init_setting(adev); xgpu_nv_mailbox_set_irq_funcs(adev); diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index 43ed6291b2b89..9ab706cd07ff4 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -216,6 +216,12 @@ enum PP_FEATURE_MASK { PP_GFX_DCS_MASK = 0x80000, }; +enum amd_harvest_ip_mask { + AMD_HARVEST_IP_VCN_MASK = 0x1, + AMD_HARVEST_IP_JPEG_MASK = 0x2, + AMD_HARVEST_IP_DMU_MASK = 0x4, +}; + enum DC_FEATURE_MASK { DC_FBC_MASK = 0x1, DC_MULTI_MON_PP_MCLK_SWITCH_MASK = 0x2, From 5c1a376823c408efd7de30fc300e687c78627f27 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Fri, 7 May 2021 13:56:46 +0800 Subject: [PATCH 223/263] drm/amdgpu: update the method for harvest IP for specific SKU Update the method of disabling VCN IP for specific SKU for navi1x ASIC, it will judge whether should add the related IP at the function of amdgpu_device_ip_block_add(). Signed-off-by: Likun Gao Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nv.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 428413c860c75..d290ca0b06da8 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -623,6 +623,16 @@ static const struct amdgpu_ip_block_version nv_common_ip_block = .funcs = &nv_common_ip_funcs, }; +static bool nv_is_headless_sku(struct pci_dev *pdev) +{ + if ((pdev->device == 0x731E && + (pdev->revision == 0xC6 || pdev->revision == 0xC7)) || + (pdev->device == 0x7340 && pdev->revision == 0xC9) || + (pdev->device == 0x7360 && pdev->revision == 0xC7)) + return true; + return false; +} + static int nv_reg_base_init(struct amdgpu_device *adev) { int r; @@ -636,6 +646,10 @@ static int nv_reg_base_init(struct amdgpu_device *adev) } amdgpu_discovery_harvest_ip(adev); + if (nv_is_headless_sku(adev->pdev)) { + adev->harvest_ip_mask |= AMD_HARVEST_IP_VCN_MASK; + adev->harvest_ip_mask |= AMD_HARVEST_IP_JPEG_MASK; + } return 0; } @@ -673,16 +687,6 @@ void nv_set_virt_ops(struct amdgpu_device *adev) adev->virt.ops = &xgpu_nv_virt_ops; } -static bool nv_is_headless_sku(struct pci_dev *pdev) -{ - if ((pdev->device == 0x731E && - (pdev->revision == 0xC6 || pdev->revision == 0xC7)) || - (pdev->device == 0x7340 && pdev->revision == 0xC9) || - (pdev->device == 0x7360 && pdev->revision == 0xC7)) - return true; - return false; -} - int nv_set_ip_blocks(struct amdgpu_device *adev) { int r; @@ -730,8 +734,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev) if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT && !amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block); - if (!nv_is_headless_sku(adev->pdev)) - amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block); + amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block); amdgpu_device_ip_block_add(adev, &jpeg_v2_0_ip_block); if (adev->enable_mes) amdgpu_device_ip_block_add(adev, &mes_v10_1_ip_block); @@ -754,8 +757,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev) if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT && !amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block); - if (!nv_is_headless_sku(adev->pdev)) - amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block); + amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block); if (!amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, &jpeg_v2_0_ip_block); break; From 227545b9a08c68778ddd89428f99c351fc9315ac Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 30 Apr 2021 12:56:56 +0800 Subject: [PATCH 224/263] drm/radeon/dpm: Disable sclk switching on Oland when two 4K 60Hz monitors are connected Screen flickers rapidly when two 4K 60Hz monitors are in use. This issue doesn't happen when one monitor is 4K 60Hz (pixelclock 594MHz) and another one is 4K 30Hz (pixelclock 297MHz). The issue is gone after setting "power_dpm_force_performance_level" to "high". Following the indication, we found that the issue occurs when sclk is too low. So resolve the issue by disabling sclk switching when there are two monitors requires high pixelclock (> 297MHz). v2: - Only apply the fix to Oland. Signed-off-by: Kai-Heng Feng Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon.h | 1 + drivers/gpu/drm/radeon/radeon_pm.c | 8 ++++++++ drivers/gpu/drm/radeon/si_dpm.c | 3 +++ 3 files changed, 12 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 42281fce552e6..56ed5634cebef 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -1549,6 +1549,7 @@ struct radeon_dpm { void *priv; u32 new_active_crtcs; int new_active_crtc_count; + int high_pixelclock_count; u32 current_active_crtcs; int current_active_crtc_count; bool single_display; diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c index 0c1950f4e146f..3861c0b98fcf3 100644 --- a/drivers/gpu/drm/radeon/radeon_pm.c +++ b/drivers/gpu/drm/radeon/radeon_pm.c @@ -1767,6 +1767,7 @@ static void radeon_pm_compute_clocks_dpm(struct radeon_device *rdev) struct drm_device *ddev = rdev->ddev; struct drm_crtc *crtc; struct radeon_crtc *radeon_crtc; + struct radeon_connector *radeon_connector; if (!rdev->pm.dpm_enabled) return; @@ -1776,6 +1777,7 @@ static void radeon_pm_compute_clocks_dpm(struct radeon_device *rdev) /* update active crtc counts */ rdev->pm.dpm.new_active_crtcs = 0; rdev->pm.dpm.new_active_crtc_count = 0; + rdev->pm.dpm.high_pixelclock_count = 0; if (rdev->num_crtc && rdev->mode_info.mode_config_initialized) { list_for_each_entry(crtc, &ddev->mode_config.crtc_list, head) { @@ -1783,6 +1785,12 @@ static void radeon_pm_compute_clocks_dpm(struct radeon_device *rdev) if (crtc->enabled) { rdev->pm.dpm.new_active_crtcs |= (1 << radeon_crtc->crtc_id); rdev->pm.dpm.new_active_crtc_count++; + if (!radeon_crtc->connector) + continue; + + radeon_connector = to_radeon_connector(radeon_crtc->connector); + if (radeon_connector->pixelclock_for_modeset > 297000) + rdev->pm.dpm.high_pixelclock_count++; } } } diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index 2c54c0d7ca5be..3add39c1a6897 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -2979,6 +2979,9 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, (rdev->pdev->device == 0x6605)) { max_sclk = 75000; } + + if (rdev->pm.dpm.high_pixelclock_count > 1) + disable_sclk_switching = true; } if (rps->vce_active) { From 3666f83a11293fd3cbeb3c9e0c3c53a33a48c28b Mon Sep 17 00:00:00 2001 From: Sathishkumar S Date: Mon, 3 May 2021 12:34:10 +0530 Subject: [PATCH 225/263] drm/amdgpu: set vcn mgcg flag for picasso enable vcn mgcg flag for picasso. Signed-off-by: Sathishkumar S Reviewed-by: Leo Liu Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/soc15.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index d80e12b80c7e5..8e1b9a40839fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1401,7 +1401,8 @@ static int soc15_common_early_init(void *handle) AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS | AMD_CG_SUPPORT_SDMA_MGCG | - AMD_CG_SUPPORT_SDMA_LS; + AMD_CG_SUPPORT_SDMA_LS | + AMD_CG_SUPPORT_VCN_MGCG; adev->pg_flags = AMD_PG_SUPPORT_SDMA | AMD_PG_SUPPORT_MMHUB | From 5c1efb5f7682e2072ca5ce12cd616d432604ecc0 Mon Sep 17 00:00:00 2001 From: Sathishkumar S Date: Mon, 3 May 2021 23:57:31 +0530 Subject: [PATCH 226/263] drm/amdgpu: update vcn1.0 Non-DPG suspend sequence update suspend register settings in Non-DPG mode. Signed-off-by: Sathishkumar S Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c index 51a773a37a354..0c1beefa3e498 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c @@ -1119,10 +1119,10 @@ static int vcn_v1_0_stop_spg_mode(struct amdgpu_device *adev) UVD_LMI_STATUS__WRITE_CLEAN_RAW_MASK; SOC15_WAIT_ON_RREG(UVD, 0, mmUVD_LMI_STATUS, tmp, tmp); - /* put VCPU into reset */ - WREG32_P(SOC15_REG_OFFSET(UVD, 0, mmUVD_SOFT_RESET), - UVD_SOFT_RESET__VCPU_SOFT_RESET_MASK, - ~UVD_SOFT_RESET__VCPU_SOFT_RESET_MASK); + /* stall UMC channel */ + WREG32_P(SOC15_REG_OFFSET(UVD, 0, mmUVD_LMI_CTRL2), + UVD_LMI_CTRL2__STALL_ARB_UMC_MASK, + ~UVD_LMI_CTRL2__STALL_ARB_UMC_MASK); tmp = UVD_LMI_STATUS__UMC_READ_CLEAN_RAW_MASK | UVD_LMI_STATUS__UMC_WRITE_CLEAN_RAW_MASK; @@ -1141,6 +1141,11 @@ static int vcn_v1_0_stop_spg_mode(struct amdgpu_device *adev) UVD_SOFT_RESET__LMI_SOFT_RESET_MASK, ~UVD_SOFT_RESET__LMI_SOFT_RESET_MASK); + /* put VCPU into reset */ + WREG32_P(SOC15_REG_OFFSET(UVD, 0, mmUVD_SOFT_RESET), + UVD_SOFT_RESET__VCPU_SOFT_RESET_MASK, + ~UVD_SOFT_RESET__VCPU_SOFT_RESET_MASK); + WREG32_SOC15(UVD, 0, mmUVD_STATUS, 0); vcn_v1_0_enable_clock_gating(adev); From af44068c581c028fd9897ca75a10fa310d8fc449 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 13 May 2021 16:18:19 +0100 Subject: [PATCH 227/263] arm64: tools: Add __ASM_CPUCAPS_H to the endif in cpucaps.h Anshuman suggested this. Suggested-by: Anshuman Khandual Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210513151819.12526-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/tools/gen-cpucaps.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/tools/gen-cpucaps.awk b/arch/arm64/tools/gen-cpucaps.awk index 18737a1ce0448..00c9e72a200a5 100755 --- a/arch/arm64/tools/gen-cpucaps.awk +++ b/arch/arm64/tools/gen-cpucaps.awk @@ -31,7 +31,7 @@ BEGIN { END { printf("#define ARM64_NCAPS\t\t\t\t%d\n", cap_num) print "" - print "#endif" + print "#endif /* __ASM_CPUCAPS_H */" } # Any lines not handled by previous rules are unexpected From d4d0ad57b3865795c4cde2fb5094c594c2e8f469 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 13 May 2021 11:51:41 +0200 Subject: [PATCH 228/263] vgacon: Record video mode changes with VT_RESIZEX Fix an issue with VGA console font size changes made after the initial video text mode has been changed with a user tool like `svgatextmode' calling the VT_RESIZEX ioctl. As it stands in that case the original screen geometry continues being used to validate further VT resizing. Consequently when the video adapter is firstly reprogrammed from the original say 80x25 text mode using a 9x16 character cell (720x400 pixel resolution) to say 80x37 text mode and the same character cell (720x592 pixel resolution), and secondly the CRTC character cell updated to 9x8 (by loading a suitable font with the KD_FONT_OP_SET request of the KDFONTOP ioctl), the VT geometry does not get further updated from 80x37 and only upper half of the screen is used for the VT, with the lower half showing rubbish corresponding to whatever happens to be there in the video memory that maps to that part of the screen. Of course the proportions change according to text mode geometries and font sizes chosen. Address the problem then, by updating the text mode geometry defaults rather than checking against them whenever the VT is resized via a user ioctl. Signed-off-by: Maciej W. Rozycki Fixes: e400b6ec4ede ("vt/vgacon: Check if screen resize request comes from userspace") Cc: stable@vger.kernel.org # v2.6.24+ Signed-off-by: Linus Torvalds --- drivers/video/console/vgacon.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index 962c12be97741..511e7d06b1485 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -1089,12 +1089,20 @@ static int vgacon_resize(struct vc_data *c, unsigned int width, if ((width << 1) * height > vga_vram_size) return -EINVAL; + if (user) { + /* + * Ho ho! Someone (svgatextmode, eh?) may have reprogrammed + * the video mode! Set the new defaults then and go away. + */ + screen_info.orig_video_cols = width; + screen_info.orig_video_lines = height; + vga_default_font_height = c->vc_font.height; + return 0; + } if (width % 2 || width > screen_info.orig_video_cols || height > (screen_info.orig_video_lines * vga_default_font_height)/ c->vc_font.height) - /* let svgatextmode tinker with video timings and - return success */ - return (user) ? 0 : -EINVAL; + return -EINVAL; if (con_is_visible(c) && !vga_is_gfx) /* who knows */ vgacon_doresize(c, width, height); From a90c275eb144c1b755f04769e1f29d832d6daeaf Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 13 May 2021 11:51:45 +0200 Subject: [PATCH 229/263] vt_ioctl: Revert VT_RESIZEX parameter handling removal Revert the removal of code handling extra VT_RESIZEX ioctl's parameters beyond those that VT_RESIZE supports, fixing a functional regression causing `svgatextmode' not to resize the VT anymore. As a consequence of the reverted change when the video adapter is reprogrammed from the original say 80x25 text mode using a 9x16 character cell (720x400 pixel resolution) to say 80x37 text mode and the same character cell (720x592 pixel resolution), the VT geometry does not get updated and only upper two thirds of the screen are used for the VT, and the lower part remains blank. The proportions change according to text mode geometries chosen. Revert the change verbatim then, bringing back previous VT resizing. Signed-off-by: Maciej W. Rozycki Fixes: 988d0763361b ("vt_ioctl: make VT_RESIZEX behave like VT_RESIZE") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Linus Torvalds --- drivers/tty/vt/vt_ioctl.c | 57 ++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index 89aeaf3c1bca6..95d10197566ba 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -671,21 +671,58 @@ static int vt_resizex(struct vc_data *vc, struct vt_consize __user *cs) if (copy_from_user(&v, cs, sizeof(struct vt_consize))) return -EFAULT; - if (v.v_vlin) - pr_info_once("\"struct vt_consize\"->v_vlin is ignored. Please report if you need this.\n"); - if (v.v_clin) - pr_info_once("\"struct vt_consize\"->v_clin is ignored. Please report if you need this.\n"); + /* FIXME: Should check the copies properly */ + if (!v.v_vlin) + v.v_vlin = vc->vc_scan_lines; + + if (v.v_clin) { + int rows = v.v_vlin / v.v_clin; + if (v.v_rows != rows) { + if (v.v_rows) /* Parameters don't add up */ + return -EINVAL; + v.v_rows = rows; + } + } + + if (v.v_vcol && v.v_ccol) { + int cols = v.v_vcol / v.v_ccol; + if (v.v_cols != cols) { + if (v.v_cols) + return -EINVAL; + v.v_cols = cols; + } + } + + if (v.v_clin > 32) + return -EINVAL; - console_lock(); for (i = 0; i < MAX_NR_CONSOLES; i++) { - vc = vc_cons[i].d; + struct vc_data *vcp; - if (vc) { - vc->vc_resize_user = 1; - vc_resize(vc, v.v_cols, v.v_rows); + if (!vc_cons[i].d) + continue; + console_lock(); + vcp = vc_cons[i].d; + if (vcp) { + int ret; + int save_scan_lines = vcp->vc_scan_lines; + int save_font_height = vcp->vc_font.height; + + if (v.v_vlin) + vcp->vc_scan_lines = v.v_vlin; + if (v.v_clin) + vcp->vc_font.height = v.v_clin; + vcp->vc_resize_user = 1; + ret = vc_resize(vcp, v.v_cols, v.v_rows); + if (ret) { + vcp->vc_scan_lines = save_scan_lines; + vcp->vc_font.height = save_font_height; + console_unlock(); + return ret; + } } + console_unlock(); } - console_unlock(); return 0; } From 860dafa902595fb5f1d23bbcce1215188c3341e6 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 13 May 2021 11:51:50 +0200 Subject: [PATCH 230/263] vt: Fix character height handling with VT_RESIZEX Restore the original intent of the VT_RESIZEX ioctl's `v_clin' parameter which is the number of pixel rows per character (cell) rather than the height of the font used. For framebuffer devices the two values are always the same, because the former is inferred from the latter one. For VGA used as a true text mode device these two parameters are independent from each other: the number of pixel rows per character is set in the CRT controller, while font height is in fact hardwired to 32 pixel rows and fonts of heights below that value are handled by padding their data with blanks when loaded to hardware for use by the character generator. One can change the setting in the CRT controller and it will update the screen contents accordingly regardless of the font loaded. The `v_clin' parameter is used by the `vgacon' driver to set the height of the character cell and then the cursor position within. Make the parameter explicit then, by defining a new `vc_cell_height' struct member of `vc_data', set it instead of `vc_font.height' from `v_clin' in the VT_RESIZEX ioctl, and then use it throughout the `vgacon' driver except where actual font data is accessed which as noted above is independent from the CRTC setting. This way the framebuffer console driver is free to ignore the `v_clin' parameter as irrelevant, as it always should have, avoiding any issues attempts to give the parameter a meaning there could have caused, such as one that has led to commit 988d0763361b ("vt_ioctl: make VT_RESIZEX behave like VT_RESIZE"): "syzbot is reporting UAF/OOB read at bit_putcs()/soft_cursor() [1][2], for vt_resizex() from ioctl(VT_RESIZEX) allows setting font height larger than actual font height calculated by con_font_set() from ioctl(PIO_FONT). Since fbcon_set_font() from con_font_set() allocates minimal amount of memory based on actual font height calculated by con_font_set(), use of vt_resizex() can cause UAF/OOB read for font data." The problem first appeared around Linux 2.5.66 which predates our repo history, but the origin could be identified with the old MIPS/Linux repo also at: as commit 9736a3546de7 ("Merge with Linux 2.5.66."), where VT_RESIZEX code in `vt_ioctl' was updated as follows: if (clin) - video_font_height = clin; + vc->vc_font.height = clin; making the parameter apply to framebuffer devices as well, perhaps due to the use of "font" in the name of the original `video_font_height' variable. Use "cell" in the new struct member then to avoid ambiguity. References: [1] https://syzkaller.appspot.com/bug?id=32577e96d88447ded2d3b76d71254fb855245837 [2] https://syzkaller.appspot.com/bug?id=6b8355d27b2b94fb5cedf4655e3a59162d9e48e3 Signed-off-by: Maciej W. Rozycki Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org # v2.6.12+ Signed-off-by: Linus Torvalds --- drivers/tty/vt/vt_ioctl.c | 6 ++--- drivers/video/console/vgacon.c | 44 +++++++++++++++++----------------- include/linux/console_struct.h | 1 + 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index 95d10197566ba..0e0cd9e9e589e 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -706,17 +706,17 @@ static int vt_resizex(struct vc_data *vc, struct vt_consize __user *cs) if (vcp) { int ret; int save_scan_lines = vcp->vc_scan_lines; - int save_font_height = vcp->vc_font.height; + int save_cell_height = vcp->vc_cell_height; if (v.v_vlin) vcp->vc_scan_lines = v.v_vlin; if (v.v_clin) - vcp->vc_font.height = v.v_clin; + vcp->vc_cell_height = v.v_clin; vcp->vc_resize_user = 1; ret = vc_resize(vcp, v.v_cols, v.v_rows); if (ret) { vcp->vc_scan_lines = save_scan_lines; - vcp->vc_font.height = save_font_height; + vcp->vc_cell_height = save_cell_height; console_unlock(); return ret; } diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index 511e7d06b1485..631eb918f8e14 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -383,7 +383,7 @@ static void vgacon_init(struct vc_data *c, int init) vc_resize(c, vga_video_num_columns, vga_video_num_lines); c->vc_scan_lines = vga_scan_lines; - c->vc_font.height = vga_video_font_height; + c->vc_font.height = c->vc_cell_height = vga_video_font_height; c->vc_complement_mask = 0x7700; if (vga_512_chars) c->vc_hi_font_mask = 0x0800; @@ -518,32 +518,32 @@ static void vgacon_cursor(struct vc_data *c, int mode) switch (CUR_SIZE(c->vc_cursor_type)) { case CUR_UNDERLINE: vgacon_set_cursor_size(c->state.x, - c->vc_font.height - - (c->vc_font.height < + c->vc_cell_height - + (c->vc_cell_height < 10 ? 2 : 3), - c->vc_font.height - - (c->vc_font.height < + c->vc_cell_height - + (c->vc_cell_height < 10 ? 1 : 2)); break; case CUR_TWO_THIRDS: vgacon_set_cursor_size(c->state.x, - c->vc_font.height / 3, - c->vc_font.height - - (c->vc_font.height < + c->vc_cell_height / 3, + c->vc_cell_height - + (c->vc_cell_height < 10 ? 1 : 2)); break; case CUR_LOWER_THIRD: vgacon_set_cursor_size(c->state.x, - (c->vc_font.height * 2) / 3, - c->vc_font.height - - (c->vc_font.height < + (c->vc_cell_height * 2) / 3, + c->vc_cell_height - + (c->vc_cell_height < 10 ? 1 : 2)); break; case CUR_LOWER_HALF: vgacon_set_cursor_size(c->state.x, - c->vc_font.height / 2, - c->vc_font.height - - (c->vc_font.height < + c->vc_cell_height / 2, + c->vc_cell_height - + (c->vc_cell_height < 10 ? 1 : 2)); break; case CUR_NONE: @@ -554,7 +554,7 @@ static void vgacon_cursor(struct vc_data *c, int mode) break; default: vgacon_set_cursor_size(c->state.x, 1, - c->vc_font.height); + c->vc_cell_height); break; } break; @@ -565,13 +565,13 @@ static int vgacon_doresize(struct vc_data *c, unsigned int width, unsigned int height) { unsigned long flags; - unsigned int scanlines = height * c->vc_font.height; + unsigned int scanlines = height * c->vc_cell_height; u8 scanlines_lo = 0, r7 = 0, vsync_end = 0, mode, max_scan; raw_spin_lock_irqsave(&vga_lock, flags); vgacon_xres = width * VGA_FONTWIDTH; - vgacon_yres = height * c->vc_font.height; + vgacon_yres = height * c->vc_cell_height; if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_MAX_SCAN, vga_video_port_reg); max_scan = inb_p(vga_video_port_val); @@ -626,9 +626,9 @@ static int vgacon_doresize(struct vc_data *c, static int vgacon_switch(struct vc_data *c) { int x = c->vc_cols * VGA_FONTWIDTH; - int y = c->vc_rows * c->vc_font.height; + int y = c->vc_rows * c->vc_cell_height; int rows = screen_info.orig_video_lines * vga_default_font_height/ - c->vc_font.height; + c->vc_cell_height; /* * We need to save screen size here as it's the only way * we can spot the screen has been resized and we need to @@ -1041,7 +1041,7 @@ static int vgacon_adjust_height(struct vc_data *vc, unsigned fontheight) cursor_size_lastto = 0; c->vc_sw->con_cursor(c, CM_DRAW); } - c->vc_font.height = fontheight; + c->vc_font.height = c->vc_cell_height = fontheight; vc_resize(c, 0, rows); /* Adjust console size */ } } @@ -1096,12 +1096,12 @@ static int vgacon_resize(struct vc_data *c, unsigned int width, */ screen_info.orig_video_cols = width; screen_info.orig_video_lines = height; - vga_default_font_height = c->vc_font.height; + vga_default_font_height = c->vc_cell_height; return 0; } if (width % 2 || width > screen_info.orig_video_cols || height > (screen_info.orig_video_lines * vga_default_font_height)/ - c->vc_font.height) + c->vc_cell_height) return -EINVAL; if (con_is_visible(c) && !vga_is_gfx) /* who knows */ diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 153734816b49c..d5b9c8d40c18e 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -101,6 +101,7 @@ struct vc_data { unsigned int vc_rows; unsigned int vc_size_row; /* Bytes per row */ unsigned int vc_scan_lines; /* # of scan lines */ + unsigned int vc_cell_height; /* CRTC character cell height */ unsigned long vc_origin; /* [!] Start of real screen */ unsigned long vc_scr_end; /* [!] End of real screen */ unsigned long vc_visible_origin; /* [!] Top of visible window */ From eb01f5353bdaa59600b29d864819056a0e3de24d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 13 May 2021 12:23:24 -0400 Subject: [PATCH 231/263] tracing: Handle %.*s in trace_check_vprintf() If a trace event uses the %*.s notation, the trace_check_vprintf() will fail and will warn about a bad processing of strings, because it does not take into account the length field when processing the star (*) part. Have it handle this case as well. Link: https://lore.kernel.org/linux-nfs/238C0E2D-C2A4-4578-ADD2-C565B3B99842@oracle.com/ Reported-by: Chuck Lever III Fixes: 9a6944fee68e2 ("tracing: Add a verifier to check string pointers for trace events") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 560e4c8d3825b..a21ef9cd2aae2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3704,6 +3704,9 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, goto print; while (*p) { + bool star = false; + int len = 0; + j = 0; /* We only care about %s and variants */ @@ -3725,13 +3728,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, /* Need to test cases like %08.*s */ for (j = 1; p[i+j]; j++) { if (isdigit(p[i+j]) || - p[i+j] == '*' || p[i+j] == '.') continue; + if (p[i+j] == '*') { + star = true; + continue; + } break; } if (p[i+j] == 's') break; + star = false; } j = 0; } @@ -3744,6 +3751,9 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, iter->fmt[i] = '\0'; trace_seq_vprintf(&iter->seq, iter->fmt, ap); + if (star) + len = va_arg(ap, int); + /* The ap now points to the string data of the %s */ str = va_arg(ap, const char *); @@ -3762,8 +3772,18 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, int ret; /* Try to safely read the string */ - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); + if (star) { + if (len + 1 > iter->fmt_size) + len = iter->fmt_size - 1; + if (len < 0) + len = 0; + ret = copy_from_kernel_nofault(iter->fmt, str, len); + iter->fmt[len] = 0; + star = false; + } else { + ret = strncpy_from_kernel_nofault(iter->fmt, str, + iter->fmt_size); + } if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", str); else @@ -3775,7 +3795,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, strncpy(iter->fmt, p + i, j + 1); iter->fmt[j+1] = '\0'; } - trace_seq_printf(&iter->seq, iter->fmt, str); + if (star) + trace_seq_printf(&iter->seq, iter->fmt, len, str); + else + trace_seq_printf(&iter->seq, iter->fmt, str); p += i + j + 1; } From 8ec7791bae1327b1c279c5cd6e929c3b12daaf0a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 May 2021 14:49:58 +1000 Subject: [PATCH 232/263] powerpc/64s: Fix crashes when toggling stf barrier The STF (store-to-load forwarding) barrier mitigation can be enabled/disabled at runtime via a debugfs file (stf_barrier), which causes the kernel to patch itself to enable/disable the relevant mitigations. However depending on which mitigation we're using, it may not be safe to do that patching while other CPUs are active. For example the following crash: User access of kernel address (c00000003fff5af0) - exploit attempt? (uid: 0) segfault (11) at c00000003fff5af0 nip 7fff8ad12198 lr 7fff8ad121f8 code 1 code: 40820128 e93c00d0 e9290058 7c292840 40810058 38600000 4bfd9a81 e8410018 code: 2c030006 41810154 3860ffb6 e9210098 7d295279 39400000 40820a3c Shows that we returned to userspace without restoring the user r13 value, due to executing the partially patched STF exit code. Fix it by doing the patching under stop machine. The CPUs that aren't doing the patching will be spinning in the core of the stop machine logic. That is currently sufficient for our purposes, because none of the patching we do is to that code or anywhere in the vicinity. Fixes: a048a07d7f45 ("powerpc/64s: Add support for a store forwarding barrier at kernel entry/exit") Cc: stable@vger.kernel.org # v4.17+ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210506044959.1298123-1-mpe@ellerman.id.au --- arch/powerpc/lib/feature-fixups.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 1fd31b4b0e139..10083add8b336 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -227,11 +228,25 @@ static void do_stf_exit_barrier_fixups(enum stf_barrier_type types) : "unknown"); } +static int __do_stf_barrier_fixups(void *data) +{ + enum stf_barrier_type *types = data; + + do_stf_entry_barrier_fixups(*types); + do_stf_exit_barrier_fixups(*types); + + return 0; +} void do_stf_barrier_fixups(enum stf_barrier_type types) { - do_stf_entry_barrier_fixups(types); - do_stf_exit_barrier_fixups(types); + /* + * The call to the fallback entry flush, and the fallback/sync-ori exit + * flush can not be safely patched in/out while other CPUs are executing + * them. So call __do_stf_barrier_fixups() on one CPU while all other CPUs + * spin in the stop machine core with interrupts hard disabled. + */ + stop_machine(__do_stf_barrier_fixups, &types, NULL); } void do_uaccess_flush_fixups(enum l1d_flush_type types) From aec86b052df6541cc97c5fca44e5934cbea4963b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 May 2021 14:49:59 +1000 Subject: [PATCH 233/263] powerpc/64s: Fix crashes when toggling entry flush barrier The entry flush mitigation can be enabled/disabled at runtime via a debugfs file (entry_flush), which causes the kernel to patch itself to enable/disable the relevant mitigations. However depending on which mitigation we're using, it may not be safe to do that patching while other CPUs are active. For example the following crash: sleeper[15639]: segfault (11) at c000000000004c20 nip c000000000004c20 lr c000000000004c20 Shows that we returned to userspace with a corrupted LR that points into the kernel, due to executing the partially patched call to the fallback entry flush (ie. we missed the LR restore). Fix it by doing the patching under stop machine. The CPUs that aren't doing the patching will be spinning in the core of the stop machine logic. That is currently sufficient for our purposes, because none of the patching we do is to that code or anywhere in the vicinity. Fixes: f79643787e0a ("powerpc/64s: flush L1D on kernel entry") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210506044959.1298123-2-mpe@ellerman.id.au --- arch/powerpc/lib/feature-fixups.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 10083add8b336..0aefa6a4a259b 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -299,8 +299,9 @@ void do_uaccess_flush_fixups(enum l1d_flush_type types) : "unknown"); } -void do_entry_flush_fixups(enum l1d_flush_type types) +static int __do_entry_flush_fixups(void *data) { + enum l1d_flush_type types = *(enum l1d_flush_type *)data; unsigned int instrs[3], *dest; long *start, *end; int i; @@ -369,6 +370,19 @@ void do_entry_flush_fixups(enum l1d_flush_type types) : "ori type" : (types & L1D_FLUSH_MTTRIG) ? "mttrig type" : "unknown"); + + return 0; +} + +void do_entry_flush_fixups(enum l1d_flush_type types) +{ + /* + * The call to the fallback flush can not be safely patched in/out while + * other CPUs are executing it. So call __do_entry_flush_fixups() on one + * CPU while all other CPUs spin in the stop machine core with interrupts + * hard disabled. + */ + stop_machine(__do_entry_flush_fixups, &types, NULL); } void do_rfi_flush_fixups(enum l1d_flush_type types) From 49b39ec248af863781a13aa6d81c5f69a2928094 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 May 2021 00:07:59 +1000 Subject: [PATCH 234/263] powerpc/64s: Fix entry flush patching w/strict RWX & hash The entry flush mitigation can be enabled/disabled at runtime. When this happens it results in the kernel patching its own instructions to enable/disable the mitigation sequence. With strict kernel RWX enabled instruction patching happens via a secondary mapping of the kernel text, so that we don't have to make the primary mapping writable. With the hash MMU this leads to a hash fault, which causes us to execute the exception entry which contains the entry flush mitigation. This means we end up executing the entry flush in a semi-patched state, ie. after we have patched the first instruction but before we patch the second or third instruction of the sequence. On machines with updated firmware the entry flush is a series of special nops, and it's safe to to execute in a semi-patched state. However when using the fallback flush the sequence is mflr/branch/mtlr, and so it's not safe to execute if we have patched out the mflr but not the other two instructions. Doing so leads to us corrputing LR, leading to an oops, for example: # echo 0 > /sys/kernel/debug/powerpc/entry_flush kernel tried to execute exec-protected page (c000000002971000) - exploit attempt? (uid: 0) BUG: Unable to handle kernel instruction fetch Faulting instruction address: 0xc000000002971000 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 0 PID: 2215 Comm: bash Not tainted 5.13.0-rc1-00010-gda3bb206c9ce #1 NIP: c000000002971000 LR: c000000002971000 CTR: c000000000120c40 REGS: c000000013243840 TRAP: 0400 Not tainted (5.13.0-rc1-00010-gda3bb206c9ce) MSR: 8000000010009033 CR: 48428482 XER: 00000000 ... NIP 0xc000000002971000 LR 0xc000000002971000 Call Trace: do_patch_instruction+0xc4/0x340 (unreliable) do_entry_flush_fixups+0x100/0x3b0 entry_flush_set+0x50/0xe0 simple_attr_write+0x160/0x1a0 full_proxy_write+0x8c/0x110 vfs_write+0xf0/0x340 ksys_write+0x84/0x140 system_call_exception+0x164/0x2d0 system_call_common+0xec/0x278 The simplest fix is to change the order in which we patch the instructions, so that the sequence is always safe to execute. For the non-fallback flushes it doesn't matter what order we patch in. Fixes: bd573a81312f ("powerpc/mm/64s: Allow STRICT_KERNEL_RWX again") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210513140800.1391706-1-mpe@ellerman.id.au --- arch/powerpc/lib/feature-fixups.c | 59 ++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 0aefa6a4a259b..5d12e37fa8bfd 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -325,6 +325,31 @@ static int __do_entry_flush_fixups(void *data) if (types & L1D_FLUSH_MTTRIG) instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + /* + * If we're patching in or out the fallback flush we need to be careful about the + * order in which we patch instructions. That's because it's possible we could + * take a page fault after patching one instruction, so the sequence of + * instructions must be safe even in a half patched state. + * + * To make that work, when patching in the fallback flush we patch in this order: + * - the mflr (dest) + * - the mtlr (dest + 2) + * - the branch (dest + 1) + * + * That ensures the sequence is safe to execute at any point. In contrast if we + * patch the mtlr last, it's possible we could return from the branch and not + * restore LR, leading to a crash later. + * + * When patching out the fallback flush (either with nops or another flush type), + * we patch in this order: + * - the branch (dest + 1) + * - the mtlr (dest + 2) + * - the mflr (dest) + * + * Note we are protected by stop_machine() from other CPUs executing the code in a + * semi-patched state. + */ + start = PTRRELOC(&__start___entry_flush_fixup); end = PTRRELOC(&__stop___entry_flush_fixup); for (i = 0; start < end; start++, i++) { @@ -332,15 +357,16 @@ static int __do_entry_flush_fixups(void *data) pr_devel("patching dest %lx\n", (unsigned long)dest); - patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); - - if (types == L1D_FLUSH_FALLBACK) - patch_branch((struct ppc_inst *)(dest + 1), (unsigned long)&entry_flush_fallback, - BRANCH_SET_LINK); - else + if (types == L1D_FLUSH_FALLBACK) { + patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); + patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + patch_branch((struct ppc_inst *)(dest + 1), + (unsigned long)&entry_flush_fallback, BRANCH_SET_LINK); + } else { patch_instruction((struct ppc_inst *)(dest + 1), ppc_inst(instrs[1])); - - patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); + } } start = PTRRELOC(&__start___scv_entry_flush_fixup); @@ -350,15 +376,16 @@ static int __do_entry_flush_fixups(void *data) pr_devel("patching dest %lx\n", (unsigned long)dest); - patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); - - if (types == L1D_FLUSH_FALLBACK) - patch_branch((struct ppc_inst *)(dest + 1), (unsigned long)&scv_entry_flush_fallback, - BRANCH_SET_LINK); - else + if (types == L1D_FLUSH_FALLBACK) { + patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); + patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + patch_branch((struct ppc_inst *)(dest + 1), + (unsigned long)&scv_entry_flush_fallback, BRANCH_SET_LINK); + } else { patch_instruction((struct ppc_inst *)(dest + 1), ppc_inst(instrs[1])); - - patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); + } } From 5b48ba2fbd77bc68feebd336ffad5ff166782bde Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 May 2021 00:08:00 +1000 Subject: [PATCH 235/263] powerpc/64s: Fix stf mitigation patching w/strict RWX & hash The stf entry barrier fallback is unsafe to execute in a semi-patched state, which can happen when enabling/disabling the mitigation with strict kernel RWX enabled and using the hash MMU. See the previous commit for more details. Fix it by changing the order in which we patch the instructions. Note the stf barrier fallback is only used on Power6 or earlier. Fixes: bd573a81312f ("powerpc/mm/64s: Allow STRICT_KERNEL_RWX again") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210513140800.1391706-2-mpe@ellerman.id.au --- arch/powerpc/lib/feature-fixups.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 5d12e37fa8bfd..fe26f2fa0f3f8 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -150,17 +150,17 @@ static void do_stf_entry_barrier_fixups(enum stf_barrier_type types) pr_devel("patching dest %lx\n", (unsigned long)dest); - patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); - - if (types & STF_BARRIER_FALLBACK) + // See comment in do_entry_flush_fixups() RE order of patching + if (types & STF_BARRIER_FALLBACK) { + patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); + patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); patch_branch((struct ppc_inst *)(dest + 1), - (unsigned long)&stf_barrier_fallback, - BRANCH_SET_LINK); - else - patch_instruction((struct ppc_inst *)(dest + 1), - ppc_inst(instrs[1])); - - patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + (unsigned long)&stf_barrier_fallback, BRANCH_SET_LINK); + } else { + patch_instruction((struct ppc_inst *)(dest + 1), ppc_inst(instrs[1])); + patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); + } } printk(KERN_DEBUG "stf-barrier: patched %d entry locations (%s barrier)\n", i, From 4ec5feec1ad029bdf7d49bc50ccc0c195eeabe93 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 3 May 2021 21:17:08 +1000 Subject: [PATCH 236/263] powerpc/64s: Make NMI record implicitly soft-masked code as irqs disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scv support introduced the notion of code that implicitly soft-masks irqs due to the instruction addresses. This is required because scv enters the kernel with MSR[EE]=1. If a NMI (including soft-NMI) interrupt hits when we are implicitly soft-masked then its regs->softe does not reflect this because it is derived from the explicit soft mask state (paca->irq_soft_mask). This makes arch_irq_disabled_regs(regs) return false. This can trigger a warning in the soft-NMI watchdog code (shown below). Fix it by having NMI interrupts set regs->softe to disabled in case of interrupting an implicit soft-masked region. ------------[ cut here ]------------ WARNING: CPU: 41 PID: 1103 at arch/powerpc/kernel/watchdog.c:259 soft_nmi_interrupt+0x3e4/0x5f0 CPU: 41 PID: 1103 Comm: (spawn) Not tainted NIP: c000000000039534 LR: c000000000039234 CTR: c000000000009a00 REGS: c000007fffbcf940 TRAP: 0700 Not tainted MSR: 9000000000021033 CR: 22042482 XER: 200400ad CFAR: c000000000039260 IRQMASK: 3 GPR00: c000000000039204 c000007fffbcfbe0 c000000001d6c300 0000000000000003 GPR04: 00007ffffa45d078 0000000000000000 0000000000000008 0000000000000020 GPR08: 0000007ffd4e0000 0000000000000000 c000007ffffceb00 7265677368657265 GPR12: 9000000000009033 c000007ffffceb00 00000f7075bf4480 000000000000002a GPR16: 00000f705745a528 00007ffffa45ddd8 00000f70574d0008 0000000000000000 GPR20: 00000f7075c58d70 00000f7057459c38 0000000000000001 0000000000000040 GPR24: 0000000000000000 0000000000000029 c000000001dae058 0000000000000029 GPR28: 0000000000000000 0000000000000800 0000000000000009 c000007fffbcfd60 NIP [c000000000039534] soft_nmi_interrupt+0x3e4/0x5f0 LR [c000000000039234] soft_nmi_interrupt+0xe4/0x5f0 Call Trace: [c000007fffbcfbe0] [c000000000039204] soft_nmi_interrupt+0xb4/0x5f0 (unreliable) [c000007fffbcfcf0] [c00000000000c0e8] soft_nmi_common+0x138/0x1c4 --- interrupt: 900 at end_real_trampolines+0x0/0x1000 NIP: c000000000003000 LR: 00007ca426adb03c CTR: 900000000280f033 REGS: c000007fffbcfd60 TRAP: 0900 MSR: 9000000000009033 CR: 44042482 XER: 200400ad CFAR: 00007ca426946020 IRQMASK: 0 GPR00: 00000000000000ad 00007ffffa45d050 00007ca426b07f00 0000000000000035 GPR04: 00007ffffa45d078 0000000000000000 0000000000000008 0000000000000020 GPR08: 0000000000000000 0000000000100000 0000000010000000 00007ffffa45d110 GPR12: 0000000000000001 00007ca426d4e680 00000f7075bf4480 000000000000002a GPR16: 00000f705745a528 00007ffffa45ddd8 00000f70574d0008 0000000000000000 GPR20: 00000f7075c58d70 00000f7057459c38 0000000000000001 0000000000000040 GPR24: 0000000000000000 00000f7057473f68 0000000000000003 000000000000041b GPR28: 00007ffffa45d4c4 0000000000000035 0000000000000000 00000f7057473f68 NIP [c000000000003000] end_real_trampolines+0x0/0x1000 LR [00007ca426adb03c] 0x7ca426adb03c --- interrupt: 900 Instruction dump: 60000000 60000000 60420000 38600001 482b3ae5 60000000 e93f0138 a36d0008 7daa6b78 71290001 7f7907b4 4082fd34 <0fe00000> 4bfffd2c 60420000 ea6100a8 ---[ end trace dc75f67d819779da ]--- Fixes: 118178e62e2e ("powerpc: move NMI entry/exit code into wrapper") Reported-by: Cédric Le Goater Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210503111708.758261-1-npiggin@gmail.com --- arch/powerpc/include/asm/interrupt.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index c77e8f57ff062..59f704408d65d 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -220,6 +220,13 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte local_paca->irq_soft_mask = IRQS_ALL_DISABLED; local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !(regs->msr & MSR_PR) && + regs->nip < (unsigned long)__end_interrupts) { + // Kernel code running below __end_interrupts is + // implicitly soft-masked. + regs->softe = IRQS_ALL_DISABLED; + } + /* Don't do any per-CPU operations until interrupt state is fixed */ if (nmi_disables_ftrace(regs)) { From c6ac667b07996929835b512de0e9a988977e6abc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 May 2021 14:40:08 +1000 Subject: [PATCH 237/263] powerpc/64e/interrupt: Fix nvgprs being clobbered Some interrupt handlers have an "extra" that saves 1 or 2 registers (r14, r15) in the paca save area and makes them available to use by the handler. The change to always save nvgprs in exception handlers lead to some interrupt handlers saving those scratch r14 / r15 registers into the interrupt frame's GPR saves, which get restored on interrupt exit. Fix this by always reloading those scratch registers from paca before the EXCEPTION_COMMON that saves nvgprs. Fixes: 4228b2c3d20e ("powerpc/64e/interrupt: always save nvgprs on interrupt") Reported-by: Christian Zigotzky Signed-off-by: Nicholas Piggin Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210514044008.1955783-1-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64e.S | 38 ++++++++++++++++++---------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 7c3654b0d0f47..f1ae710274bc9 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -340,6 +340,12 @@ ret_from_mc_except: andi. r10,r10,IRQS_DISABLED; /* yes -> go out of line */ \ bne masked_interrupt_book3e_##n +/* + * Additional regs must be re-loaded from paca before EXCEPTION_COMMON* is + * called, because that does SAVE_NVGPRS which must see the original register + * values, otherwise the scratch values might be restored when exiting the + * interrupt. + */ #define PROLOG_ADDITION_2REGS_GEN(n) \ std r14,PACA_EXGEN+EX_R14(r13); \ std r15,PACA_EXGEN+EX_R15(r13) @@ -535,6 +541,10 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR + std r14,_DAR(r1) + std r15,_DSISR(r1) + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x300) b storage_fault_common @@ -544,6 +554,10 @@ __end_interrupts: PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 + std r14,_DAR(r1) + std r15,_DSISR(r1) + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x400) b storage_fault_common @@ -557,6 +571,10 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR + std r14,_DAR(r1) + std r15,_DSISR(r1) + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x600) b alignment_more /* no room, go out of line */ @@ -565,10 +583,10 @@ __end_interrupts: NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - EXCEPTION_COMMON(0x700) std r14,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXGEN+EX_R14(r13) + EXCEPTION_COMMON(0x700) + addi r3,r1,STACK_FRAME_OVERHEAD bl program_check_exception REST_NVGPRS(r1) b interrupt_return @@ -725,11 +743,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) * normal exception */ mfspr r14,SPRN_DBSR - EXCEPTION_COMMON_CRIT(0xd00) std r14,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXCRIT+EX_R14(r13) ld r15,PACA_EXCRIT+EX_R15(r13) + EXCEPTION_COMMON_CRIT(0xd00) + addi r3,r1,STACK_FRAME_OVERHEAD bl DebugException REST_NVGPRS(r1) b interrupt_return @@ -796,11 +814,11 @@ kernel_dbg_exc: * normal exception */ mfspr r14,SPRN_DBSR - EXCEPTION_COMMON_DBG(0xd08) std r14,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXDBG+EX_R14(r13) ld r15,PACA_EXDBG+EX_R15(r13) + EXCEPTION_COMMON_DBG(0xd08) + addi r3,r1,STACK_FRAME_OVERHEAD bl DebugException REST_NVGPRS(r1) b interrupt_return @@ -931,11 +949,7 @@ masked_interrupt_book3e_0x2c0: * original values stashed away in the PACA */ storage_fault_common: - std r14,_DAR(r1) - std r15,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD - ld r14,PACA_EXGEN+EX_R14(r13) - ld r15,PACA_EXGEN+EX_R15(r13) bl do_page_fault b interrupt_return @@ -944,11 +958,7 @@ storage_fault_common: * continues here. */ alignment_more: - std r14,_DAR(r1) - std r15,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD - ld r14,PACA_EXGEN+EX_R14(r13) - ld r15,PACA_EXGEN+EX_R15(r13) bl alignment_exception REST_NVGPRS(r1) b interrupt_return From 447c19f3b5074409c794b350b10306e1da1ef4ba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 May 2021 12:02:50 +0100 Subject: [PATCH 238/263] io_uring: fix ltout double free on completion race Always remove linked timeout on io_link_timeout_fn() from the master request link list, otherwise we may get use-after-free when first io_link_timeout_fn() puts linked timeout in the fail path, and then will be found and put on master's free. Cc: stable@vger.kernel.org # 5.10+ Fixes: 90cd7e424969d ("io_uring: track link timeout's master explicitly") Reported-and-tested-by: syzbot+5a864149dd970b546223@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/69c46bf6ce37fec4fdcd98f0882e18eb07ce693a.1620990121.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ac5e278a91e6..599102cc6dfc2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6354,10 +6354,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (prev && req_ref_inc_not_zero(prev)) + if (prev) { io_remove_next_linked(prev); - else - prev = NULL; + if (!req_ref_inc_not_zero(prev)) + prev = NULL; + } spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { From 2d74d0421e5afc1e7be7167ffb7eb8b2cf32343a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 May 2021 12:05:46 +0100 Subject: [PATCH 239/263] io_uring: further remove sqpoll limits on opcodes There are three types of requests that left disabled for sqpoll, namely epoll ctx, statx, and resources update. Since SQPOLL task is now closely mimics a userspace thread, remove the restrictions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/909b52d70c45636d8d7897582474ea5aab5eed34.1620990306.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 599102cc6dfc2..29ec5b28c73dc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4035,7 +4035,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #if defined(CONFIG_EPOLL) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -4150,7 +4150,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index) return -EINVAL; @@ -5827,8 +5827,6 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) static int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL)) - return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->ioprio || sqe->rw_flags) From 489809e2e22b3dedc0737163d97eb2b574137b42 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 May 2021 12:06:44 +0100 Subject: [PATCH 240/263] io_uring: increase max number of reg buffers Since recent changes instead of storing a large array of struct io_mapped_ubuf, we store pointers to them, that is 4 times slimmer and we should not to so worry about restricting max number of registererd buffer slots, increase the limit 4 times. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d3dee1da37f46da416aa96a16bf9e5094e10584d.1620990371.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 29ec5b28c73dc..e481ac8a757ad 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -100,6 +100,8 @@ #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) +#define IORING_MAX_REG_BUFFERS (1U << 14) + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ IOSQE_BUFFER_SELECT) @@ -8389,7 +8391,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, if (ctx->user_bufs) return -EBUSY; - if (!nr_args || nr_args > UIO_MAXIOV) + if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; ret = io_rsrc_node_switch_start(ctx); if (ret) From 3486d2c9be652a31033363bdd50391b0c8a8fe21 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 13 May 2021 09:32:46 +0200 Subject: [PATCH 241/263] clocksource/drivers/hyper-v: Re-enable VDSO_CLOCKMODE_HVCLOCK on X86 Mohammed reports (https://bugzilla.kernel.org/show_bug.cgi?id=213029) the commit e4ab4658f1cf ("clocksource/drivers/hyper-v: Handle vDSO differences inline") broke vDSO on x86. The problem appears to be that VDSO_CLOCKMODE_HVCLOCK is an enum value in 'enum vdso_clock_mode' and '#ifdef VDSO_CLOCKMODE_HVCLOCK' branch evaluates to false (it is not a define). Use a dedicated HAVE_VDSO_CLOCKMODE_HVCLOCK define instead. Fixes: e4ab4658f1cf ("clocksource/drivers/hyper-v: Handle vDSO differences inline") Reported-by: Mohammed Gamal Suggested-by: Thomas Gleixner Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210513073246.1715070-1-vkuznets@redhat.com --- arch/x86/include/asm/vdso/clocksource.h | 2 ++ drivers/clocksource/hyperv_timer.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/vdso/clocksource.h b/arch/x86/include/asm/vdso/clocksource.h index 119ac8612d893..136e5e57cfe11 100644 --- a/arch/x86/include/asm/vdso/clocksource.h +++ b/arch/x86/include/asm/vdso/clocksource.h @@ -7,4 +7,6 @@ VDSO_CLOCKMODE_PVCLOCK, \ VDSO_CLOCKMODE_HVCLOCK +#define HAVE_VDSO_CLOCKMODE_HVCLOCK + #endif /* __ASM_VDSO_CLOCKSOURCE_H */ diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 977fd05ac35f6..d6ece7bbce894 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -419,7 +419,7 @@ static void resume_hv_clock_tsc(struct clocksource *arg) hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr); } -#ifdef VDSO_CLOCKMODE_HVCLOCK +#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK static int hv_cs_enable(struct clocksource *cs) { vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); @@ -435,7 +435,7 @@ static struct clocksource hyperv_cs_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, .suspend= suspend_hv_clock_tsc, .resume = resume_hv_clock_tsc, -#ifdef VDSO_CLOCKMODE_HVCLOCK +#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK .enable = hv_cs_enable, .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, #else From cb6f6b3384d7825d2a43f2256c5200e3b3956fc8 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 12 May 2021 13:18:21 -0700 Subject: [PATCH 242/263] xen/arm: move xen_swiotlb_detect to arm/swiotlb-xen.h Move xen_swiotlb_detect to a static inline function to make it available to !CONFIG_XEN builds. CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Stefano Stabellini Reviewed-by: Christoph Hellwig Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210512201823.1963-1-sstabellini@kernel.org Signed-off-by: Juergen Gross --- arch/arm/xen/mm.c | 12 ------------ include/xen/arm/swiotlb-xen.h | 15 ++++++++++++++- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index f8f07469d2591..223b1151fd7de 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -135,18 +135,6 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) return; } -int xen_swiotlb_detect(void) -{ - if (!xen_domain()) - return 0; - if (xen_feature(XENFEAT_direct_mapped)) - return 1; - /* legacy case */ - if (!xen_feature(XENFEAT_not_direct_mapped) && xen_initial_domain()) - return 1; - return 0; -} - static int __init xen_mm_init(void) { struct gnttab_cache_flush cflush; diff --git a/include/xen/arm/swiotlb-xen.h b/include/xen/arm/swiotlb-xen.h index 2994fe6031a09..33336ab58afcf 100644 --- a/include/xen/arm/swiotlb-xen.h +++ b/include/xen/arm/swiotlb-xen.h @@ -2,6 +2,19 @@ #ifndef _ASM_ARM_SWIOTLB_XEN_H #define _ASM_ARM_SWIOTLB_XEN_H -extern int xen_swiotlb_detect(void); +#include +#include + +static inline int xen_swiotlb_detect(void) +{ + if (!xen_domain()) + return 0; + if (xen_feature(XENFEAT_direct_mapped)) + return 1; + /* legacy case */ + if (!xen_feature(XENFEAT_not_direct_mapped) && xen_initial_domain()) + return 1; + return 0; +} #endif /* _ASM_ARM_SWIOTLB_XEN_H */ From 687842ec50342b716953f5847a49dd337cb6de8c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 12 May 2021 13:18:22 -0700 Subject: [PATCH 243/263] arm64: do not set SWIOTLB_NO_FORCE when swiotlb is required Although SWIOTLB_NO_FORCE is meant to allow later calls to swiotlb_init, today dma_direct_map_page returns error if SWIOTLB_NO_FORCE. For now, without a larger overhaul of SWIOTLB_NO_FORCE, the best we can do is to avoid setting SWIOTLB_NO_FORCE in mem_init when we know that it is going to be required later (e.g. Xen requires it). CC: boris.ostrovsky@oracle.com CC: jgross@suse.com CC: catalin.marinas@arm.com CC: will@kernel.org CC: linux-arm-kernel@lists.infradead.org Fixes: 2726bf3ff252 ("swiotlb: Make SWIOTLB_NO_FORCE perform no allocation") Signed-off-by: Christoph Hellwig Signed-off-by: Stefano Stabellini Reviewed-by: Juergen Gross Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210512201823.1963-2-sstabellini@kernel.org Signed-off-by: Juergen Gross --- arch/arm64/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 16a2b2b1c54d4..e55409caaee34 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -43,6 +43,7 @@ #include #include #include +#include /* * We need to be able to catch inadvertent references to memstart_addr @@ -482,7 +483,7 @@ void __init mem_init(void) if (swiotlb_force == SWIOTLB_FORCE || max_pfn > PFN_DOWN(arm64_dma_phys_limit)) swiotlb_init(1); - else + else if (!xen_swiotlb_detect()) swiotlb_force = SWIOTLB_NO_FORCE; set_max_mapnr(max_pfn - PHYS_PFN_OFFSET); From 97729b653de52ba98e08732dd8855586e37a3a31 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 12 May 2021 13:18:23 -0700 Subject: [PATCH 244/263] xen/swiotlb: check if the swiotlb has already been initialized xen_swiotlb_init calls swiotlb_late_init_with_tbl, which fails with -ENOMEM if the swiotlb has already been initialized. Add an explicit check io_tlb_default_mem != NULL at the beginning of xen_swiotlb_init. If the swiotlb is already initialized print a warning and return -EEXIST. On x86, the error propagates. On ARM, we don't actually need a special swiotlb buffer (yet), any buffer would do. So ignore the error and continue. CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210512201823.1963-3-sstabellini@kernel.org Signed-off-by: Juergen Gross --- arch/arm/xen/mm.c | 8 +++++++- drivers/xen/swiotlb-xen.c | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 223b1151fd7de..a7e54a087b802 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -138,9 +138,15 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) static int __init xen_mm_init(void) { struct gnttab_cache_flush cflush; + int rc; + if (!xen_swiotlb_detect()) return 0; - xen_swiotlb_init(); + + rc = xen_swiotlb_init(); + /* we can work with the default swiotlb */ + if (rc < 0 && rc != -EEXIST) + return rc; cflush.op = 0; cflush.a.dev_bus_addr = 0; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 4c89afc0df628..24d11861ac7d8 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -164,6 +164,11 @@ int __ref xen_swiotlb_init(void) int rc = -ENOMEM; char *start; + if (io_tlb_default_mem != NULL) { + pr_warn("swiotlb buffer already initialized\n"); + return -EEXIST; + } + retry: m_ret = XEN_SWIOTLB_ENOMEM; order = get_order(bytes); From 03f26d8f11403295de445b6e4e0e57ac57755791 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 14 May 2021 10:20:52 +0800 Subject: [PATCH 245/263] blk-mq: plug request for shared sbitmap In case of shared sbitmap, request won't be held in plug list any more sine commit 32bc15afed04 ("blk-mq: Facilitate a shared sbitmap per tagset"), this way makes request merge from flush plug list & batching submission not possible, so cause performance regression. Yanhui reports performance regression when running sequential IO test(libaio, 16 jobs, 8 depth for each job) in VM, and the VM disk is emulated with image stored on xfs/megaraid_sas. Fix the issue by recovering original behavior to allow to hold request in plug list. Cc: Yanhui Ma Cc: John Garry Cc: Bart Van Assche Cc: kashyap.desai@broadcom.com Fixes: 32bc15afed04 ("blk-mq: Facilitate a shared sbitmap per tagset") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210514022052.1047665-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 466676bc2f0be..28ef0248efba3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2232,8 +2232,9 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) /* Bypass scheduler for flush requests */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs || - !blk_queue_nonrot(q))) { + } else if (plug && (q->nr_hw_queues == 1 || + blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) || + q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { /* * Use plugging if we have a ->commit_rqs() hook as well, as * we know the driver uses bd->last in a smart fashion. From 630ef623ed26c18a457cdc070cf24014e50129c2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 13 May 2021 10:15:29 -0700 Subject: [PATCH 246/263] blk-mq: Swap two calls in blk_mq_exit_queue() If a tag set is shared across request queues (e.g. SCSI LUNs) then the block layer core keeps track of the number of active request queues in tags->active_queues. blk_mq_tag_busy() and blk_mq_tag_idle() update that atomic counter if the hctx flag BLK_MQ_F_TAG_QUEUE_SHARED is set. Make sure that blk_mq_exit_queue() calls blk_mq_tag_idle() before that flag is cleared by blk_mq_del_queue_tag_set(). Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Fixes: 0d2602ca30e4 ("blk-mq: improve support for shared tags maps") Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210513171529.7977-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 28ef0248efba3..c86c01bfecdbe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3286,10 +3286,12 @@ EXPORT_SYMBOL(blk_mq_init_allocated_queue); /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { - struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_tag_set *set = q->tag_set; - blk_mq_del_queue_tag_set(q); + /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); + /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ + blk_mq_del_queue_tag_set(q); } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) From 4bc2082311311892742deb2ce04bc335f85ee27a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 13 May 2021 10:17:08 -0700 Subject: [PATCH 247/263] block/partitions/efi.c: Fix the efi_partition() kernel-doc header Fix the following kernel-doc warning: block/partitions/efi.c:685: warning: wrong kernel-doc identifier on line: * efi_partition(struct parsed_partitions *state) Cc: Alexander Viro Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210513171708.8391-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/partitions/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index b64bfdd4326c9..e2716792ecc13 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -682,7 +682,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) } /** - * efi_partition(struct parsed_partitions *state) + * efi_partition - scan for GPT partitions * @state: disk parsed partitions * * Description: called from check.c, if the disk contains GPT From 588a513d34257fdde95a9f0df0202e31998e85c6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 14 May 2021 10:50:01 +0100 Subject: [PATCH 248/263] arm64: Fix race condition on PG_dcache_clean in __sync_icache_dcache() To ensure that instructions are observable in a new mapping, the arm64 set_pte_at() implementation cleans the D-cache and invalidates the I-cache to the PoU. As an optimisation, this is only done on executable mappings and the PG_dcache_clean page flag is set to avoid future cache maintenance on the same page. When two different processes map the same page (e.g. private executable file or shared mapping) there's a potential race on checking and setting PG_dcache_clean via set_pte_at() -> __sync_icache_dcache(). While on the fault paths the page is locked (PG_locked), mprotect() does not take the page lock. The result is that one process may see the PG_dcache_clean flag set but the I/D cache maintenance not yet performed. Avoid test_and_set_bit(PG_dcache_clean) in favour of separate test_bit() and set_bit(). In the rare event of a race, the cache maintenance is done twice. Signed-off-by: Catalin Marinas Cc: Cc: Will Deacon Cc: Steven Price Reviewed-by: Steven Price Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210514095001.13236-1-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/flush.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index ac485163a4a76..6d44c028d1c9e 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -55,8 +55,10 @@ void __sync_icache_dcache(pte_t pte) { struct page *page = pte_page(pte); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + if (!test_bit(PG_dcache_clean, &page->flags)) { sync_icache_aliases(page_address(page), page_size(page)); + set_bit(PG_dcache_clean, &page->flags); + } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); From 22247efd822e6d263f3c8bd327f3f769aea9b1d9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 14 May 2021 17:27:04 -0700 Subject: [PATCH 249/263] mm/hugetlb: fix F_SEAL_FUTURE_WRITE Patch series "mm/hugetlb: Fix issues on file sealing and fork", v2. Hugh reported issue with F_SEAL_FUTURE_WRITE not applied correctly to hugetlbfs, which I can easily verify using the memfd_test program, which seems that the program is hardly run with hugetlbfs pages (as by default shmem). Meanwhile I found another probably even more severe issue on that hugetlb fork won't wr-protect child cow pages, so child can potentially write to parent private pages. Patch 2 addresses that. After this series applied, "memfd_test hugetlbfs" should start to pass. This patch (of 2): F_SEAL_FUTURE_WRITE is missing for hugetlb starting from the first day. There is a test program for that and it fails constantly. $ ./memfd_test hugetlbfs memfd-hugetlb: CREATE memfd-hugetlb: BASIC memfd-hugetlb: SEAL-WRITE memfd-hugetlb: SEAL-FUTURE-WRITE mmap() didn't fail as expected Aborted (core dumped) I think it's probably because no one is really running the hugetlbfs test. Fix it by checking FUTURE_WRITE also in hugetlbfs_file_mmap() as what we do in shmem_mmap(). Generalize a helper for that. Link: https://lkml.kernel.org/r/20210503234356.9097-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210503234356.9097-2-peterx@redhat.com Fixes: ab3948f58ff84 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd") Signed-off-by: Peter Xu Reported-by: Hugh Dickins Reviewed-by: Mike Kravetz Cc: Joel Fernandes (Google) Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 5 +++++ include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ mm/shmem.c | 22 ++++------------------ 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a2a42335e8fd2..9d9e0097c1d38 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -131,6 +131,7 @@ static void huge_pagevec_release(struct pagevec *pvec) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -146,6 +147,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; + ret = seal_check_future_write(info->seals, vma); + if (ret) + return ret; + /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can diff --git a/include/linux/mm.h b/include/linux/mm.h index 322ec61d0da79..c274f75efcf97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3216,5 +3216,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +/** + * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it + * @seals: the seals to check + * @vma: the vma to operate on + * + * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on + * the vma flags. Return 0 if check pass, or <0 for errors. + */ +static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) +{ + if (seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (vma->vm_flags & VM_SHARED) + vma->vm_flags &= ~(VM_MAYWRITE); + } + + return 0; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/shmem.c b/mm/shmem.c index a08cedefbfaa6..eb131b9fb1909 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2258,25 +2258,11 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + int ret; - if (info->seals & F_SEAL_FUTURE_WRITE) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * "future write" seal active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); - } + ret = seal_check_future_write(info->seals, vma); + if (ret) + return ret; /* arm64 - allow memory tagging on RAM-based files */ vma->vm_flags |= VM_MTE_ALLOWED; From 84894e1c42e9f25c17f2888e0c0e1505cb727538 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 14 May 2021 17:27:07 -0700 Subject: [PATCH 250/263] mm/hugetlb: fix cow where page writtable in child When rework early cow of pinned hugetlb pages, we moved huge_ptep_get() upper but overlooked a side effect that the huge_ptep_get() will fetch the pte after wr-protection. After moving it upwards, we need explicit wr-protect of child pte or we will keep the write bit set in the child process, which could cause data corrution where the child can write to the original page directly. This issue can also be exposed by "memfd_test hugetlbfs" kselftest. Link: https://lkml.kernel.org/r/20210503234356.9097-3-peterx@redhat.com Fixes: 4eae4efa2c299 ("hugetlb: do early cow when page pinned on src mm") Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3db405dea3dc9..95918f410c0f8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4056,6 +4056,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * See Documentation/vm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); + entry = huge_pte_wrprotect(entry); } page_dup_rmap(ptepage, true); From afe0c26d1968fe3bbef6a45df945bfeff774ca75 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 14 May 2021 17:27:10 -0700 Subject: [PATCH 251/263] mm, slub: move slub_debug static key enabling outside slab_mutex Paul E. McKenney reported [1] that commit 1f0723a4c0df ("mm, slub: enable slub_debug static key when creating cache with explicit debug flags") results in the lockdep complaint: ====================================================== WARNING: possible circular locking dependency detected 5.12.0+ #15 Not tainted ------------------------------------------------------ rcu_torture_sta/109 is trying to acquire lock: ffffffff96063cd0 (cpu_hotplug_lock){++++}-{0:0}, at: static_key_enable+0x9/0x20 but task is already holding lock: ffffffff96173c28 (slab_mutex){+.+.}-{3:3}, at: kmem_cache_create_usercopy+0x2d/0x250 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (slab_mutex){+.+.}-{3:3}: lock_acquire+0xb9/0x3a0 __mutex_lock+0x8d/0x920 slub_cpu_dead+0x15/0xf0 cpuhp_invoke_callback+0x17a/0x7c0 cpuhp_invoke_callback_range+0x3b/0x80 _cpu_down+0xdf/0x2a0 cpu_down+0x2c/0x50 device_offline+0x82/0xb0 remove_cpu+0x1a/0x30 torture_offline+0x80/0x140 torture_onoff+0x147/0x260 kthread+0x10a/0x140 ret_from_fork+0x22/0x30 -> #0 (cpu_hotplug_lock){++++}-{0:0}: check_prev_add+0x8f/0xbf0 __lock_acquire+0x13f0/0x1d80 lock_acquire+0xb9/0x3a0 cpus_read_lock+0x21/0xa0 static_key_enable+0x9/0x20 __kmem_cache_create+0x38d/0x430 kmem_cache_create_usercopy+0x146/0x250 kmem_cache_create+0xd/0x10 rcu_torture_stats+0x79/0x280 kthread+0x10a/0x140 ret_from_fork+0x22/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slab_mutex); lock(cpu_hotplug_lock); lock(slab_mutex); lock(cpu_hotplug_lock); *** DEADLOCK *** 1 lock held by rcu_torture_sta/109: #0: ffffffff96173c28 (slab_mutex){+.+.}-{3:3}, at: kmem_cache_create_usercopy+0x2d/0x250 stack backtrace: CPU: 3 PID: 109 Comm: rcu_torture_sta Not tainted 5.12.0+ #15 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack+0x6d/0x89 check_noncircular+0xfe/0x110 ? lock_is_held_type+0x98/0x110 check_prev_add+0x8f/0xbf0 __lock_acquire+0x13f0/0x1d80 lock_acquire+0xb9/0x3a0 ? static_key_enable+0x9/0x20 ? mark_held_locks+0x49/0x70 cpus_read_lock+0x21/0xa0 ? static_key_enable+0x9/0x20 static_key_enable+0x9/0x20 __kmem_cache_create+0x38d/0x430 kmem_cache_create_usercopy+0x146/0x250 ? rcu_torture_stats_print+0xd0/0xd0 kmem_cache_create+0xd/0x10 rcu_torture_stats+0x79/0x280 ? rcu_torture_stats_print+0xd0/0xd0 kthread+0x10a/0x140 ? kthread_park+0x80/0x80 ret_from_fork+0x22/0x30 This is because there's one order of locking from the hotplug callbacks: lock(cpu_hotplug_lock); // from hotplug machinery itself lock(slab_mutex); // in e.g. slab_mem_going_offline_callback() And commit 1f0723a4c0df made the reverse sequence possible: lock(slab_mutex); // in kmem_cache_create_usercopy() lock(cpu_hotplug_lock); // kmem_cache_open() -> static_key_enable() The simplest fix is to move static_key_enable() to a place before slab_mutex is taken. That means kmem_cache_create_usercopy() in mm/slab_common.c which is not ideal for SLUB-specific code, but the #ifdef CONFIG_SLUB_DEBUG makes it at least self-contained and obvious. [1] https://lore.kernel.org/lkml/20210502171827.GA3670492@paulmck-ThinkPad-P17-Gen-1/ Link: https://lkml.kernel.org/r/20210504120019.26791-1-vbabka@suse.cz Fixes: 1f0723a4c0df ("mm, slub: enable slub_debug static key when creating cache with explicit debug flags") Signed-off-by: Vlastimil Babka Reported-by: Paul E. McKenney Tested-by: Paul E. McKenney Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 10 ++++++++++ mm/slub.c | 9 --------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index f8833d3e5d47e..a4a571428c511 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -318,6 +318,16 @@ kmem_cache_create_usercopy(const char *name, const char *cache_name; int err; +#ifdef CONFIG_SLUB_DEBUG + /* + * If no slub_debug was enabled globally, the static key is not yet + * enabled by setup_slub_debug(). Enable it if the cache is being + * created with any of the debugging flags passed explicitly. + */ + if (flags & SLAB_DEBUG_FLAGS) + static_branch_enable(&slub_debug_enabled); +#endif + mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); diff --git a/mm/slub.c b/mm/slub.c index feda53ae62ba4..438fa8d4c970d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3828,15 +3828,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { -#ifdef CONFIG_SLUB_DEBUG - /* - * If no slub_debug was enabled globally, the static key is not yet - * enabled by setup_slub_debug(). Enable it if the cache is being - * created with any of the debugging flags passed explicitly. - */ - if (flags & SLAB_DEBUG_FLAGS) - static_branch_enable(&slub_debug_enabled); -#endif s->flags = kmem_cache_flags(s->size, flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); From eb1f065f90cdcdcc704e9e2dc678931317c69a99 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 14 May 2021 17:27:13 -0700 Subject: [PATCH 252/263] kernel/resource: fix return code check in __request_free_mem_region Splitting an earlier version of a patch that allowed calling __request_region() while holding the resource lock into a series of patches required changing the return code for the newly introduced __request_region_locked(). Unfortunately this change was not carried through to a subsequent commit 56fd94919b8b ("kernel/resource: fix locking in request_free_mem_region") in the series. This resulted in a use-after-free due to freeing the struct resource without properly releasing it. Fix this by correcting the return code check so that the struct is not freed if the request to add it was successful. Link: https://lkml.kernel.org/r/20210512073528.22334-1-apopple@nvidia.com Fixes: 56fd94919b8b ("kernel/resource: fix locking in request_free_mem_region") Signed-off-by: Alistair Popple Reported-by: kernel test robot Reviewed-by: David Hildenbrand Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Jerome Glisse Cc: John Hubbard Cc: Muchun Song Cc: Oliver Sang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index 028a5ab18818f..ca9f5198a01ff 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1805,7 +1805,7 @@ static struct resource *__request_free_mem_region(struct device *dev, REGION_DISJOINT) continue; - if (!__request_region_locked(res, &iomem_resource, addr, size, + if (__request_region_locked(res, &iomem_resource, addr, size, name, 0)) break; From d6e621de1fceb3b098ebf435ef7ea91ec4838a1a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 14 May 2021 17:27:16 -0700 Subject: [PATCH 253/263] squashfs: fix divide error in calculate_skip() Sysbot has reported a "divide error" which has been identified as being caused by a corrupted file_size value within the file inode. This value has been corrupted to a much larger value than expected. Calculate_skip() is passed i_size_read(inode) >> msblk->block_log. Due to the file_size value corruption this overflows the int argument/variable in that function, leading to the divide error. This patch changes the function to use u64. This will accommodate any unexpectedly large values due to corruption. The value returned from calculate_skip() is clamped to be never more than SQUASHFS_CACHED_BLKS - 1, or 7. So file_size corruption does not lead to an unexpectedly large return result here. Link: https://lkml.kernel.org/r/20210507152618.9447-1-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: Reported-by: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 7b1128398976e..89d492916deaf 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -211,11 +211,11 @@ static long long read_indexes(struct super_block *sb, int n, * If the skip factor is limited in this way then the file will use multiple * slots. */ -static inline int calculate_skip(int blocks) +static inline int calculate_skip(u64 blocks) { - int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) + u64 skip = blocks / ((SQUASHFS_META_ENTRIES + 1) * SQUASHFS_META_INDEXES); - return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); + return min((u64) SQUASHFS_CACHED_BLKS - 1, skip + 1); } From 7ed9d238c7dbb1fdb63ad96a6184985151b0171c Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 14 May 2021 17:27:19 -0700 Subject: [PATCH 254/263] userfaultfd: release page in error path to avoid BUG_ON Consider the following sequence of events: 1. Userspace issues a UFFD ioctl, which ends up calling into shmem_mfill_atomic_pte(). We successfully account the blocks, we shmem_alloc_page(), but then the copy_from_user() fails. We return -ENOENT. We don't release the page we allocated. 2. Our caller detects this error code, tries the copy_from_user() after dropping the mmap_lock, and retries, calling back into shmem_mfill_atomic_pte(). 3. Meanwhile, let's say another process filled up the tmpfs being used. 4. So shmem_mfill_atomic_pte() fails to account blocks this time, and immediately returns - without releasing the page. This triggers a BUG_ON in our caller, which asserts that the page should always be consumed, unless -ENOENT is returned. To fix this, detect if we have such a "dangling" page when accounting fails, and if so, release it before returning. Link: https://lkml.kernel.org/r/20210428230858.348400-1-axelrasmussen@google.com Fixes: cb658a453b93 ("userfaultfd: shmem: avoid leaking blocks and used blocks in UFFDIO_COPY") Signed-off-by: Axel Rasmussen Reported-by: Hugh Dickins Acked-by: Hugh Dickins Reviewed-by: Peter Xu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index eb131b9fb1909..5d46611cba8dc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2361,8 +2361,18 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pgoff_t offset, max_off; ret = -ENOMEM; - if (!shmem_inode_acct_block(inode, 1)) + if (!shmem_inode_acct_block(inode, 1)) { + /* + * We may have got a page, returned -ENOENT triggering a retry, + * and now we find ourselves with -ENOMEM. Release the page, to + * avoid a BUG_ON in our caller. + */ + if (unlikely(*pagep)) { + put_page(*pagep); + *pagep = NULL; + } goto out; + } if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); From 628622904b8d229591134e44efd6608a7541eb89 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 14 May 2021 17:27:22 -0700 Subject: [PATCH 255/263] ksm: revert "use GET_KSM_PAGE_NOLOCK to get ksm page in remove_rmap_item_from_tree()" This reverts commit 3e96b6a2e9ad929a3230a22f4d64a74671a0720b. General Protection Fault in rmap_walk_ksm() under memory pressure: remove_rmap_item_from_tree() needs to take page lock, of course. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2105092253500.1127@eggly.anvils Signed-off-by: Hugh Dickins Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 6bbe314c52603..2f3aaeb34a42e 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -776,11 +776,12 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) goto out; hlist_del(&rmap_item->hlist); + unlock_page(page); put_page(page); if (!hlist_empty(&stable_node->hlist)) From 9ddb3c14afba8bc5950ed297f02d4ae05ff35cd1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 May 2021 17:27:24 -0700 Subject: [PATCH 256/263] mm: fix struct page layout on 32-bit systems 32-bit architectures which expect 8-byte alignment for 8-byte integers and need 64-bit DMA addresses (arm, mips, ppc) had their struct page inadvertently expanded in 2019. When the dma_addr_t was added, it forced the alignment of the union to 8 bytes, which inserted a 4 byte gap between 'flags' and the union. Fix this by storing the dma_addr_t in one or two adjacent unsigned longs. This restores the alignment to that of an unsigned long. We always store the low bits in the first word to prevent the PageTail bit from being inadvertently set on a big endian platform. If that happened, get_user_pages_fast() racing against a page which was freed and reallocated to the page_pool could dereference a bogus compound_head(), which would be hard to trace back to this cause. Link: https://lkml.kernel.org/r/20210510153211.1504886-1-willy@infradead.org Fixes: c25fff7171be ("mm: add dma_addr_t to struct page") Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Acked-by: Vlastimil Babka Tested-by: Matteo Croce Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++-- include/net/page_pool.h | 12 +++++++++++- net/core/page_pool.c | 12 +++++++----- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6613b26a88946..5aacc1c10a45a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -97,10 +97,10 @@ struct page { }; struct { /* page_pool used by netstack */ /** - * @dma_addr: might require a 64-bit value even on + * @dma_addr: might require a 64-bit value on * 32-bit architectures. */ - dma_addr_t dma_addr; + unsigned long dma_addr[2]; }; struct { /* slab, slob and slub */ union { diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 6d517a37c18bf..b4b6de909c934 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -198,7 +198,17 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - return page->dma_addr; + dma_addr_t ret = page->dma_addr[0]; + if (sizeof(dma_addr_t) > sizeof(unsigned long)) + ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16; + return ret; +} + +static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +{ + page->dma_addr[0] = addr; + if (sizeof(dma_addr_t) > sizeof(unsigned long)) + page->dma_addr[1] = upper_32_bits(addr); } static inline bool is_page_pool_compiled_in(void) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9ec1aa9640ade..3c4c4c7a04022 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -174,8 +174,10 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool, struct page *page, unsigned int dma_sync_size) { + dma_addr_t dma_addr = page_pool_get_dma_addr(page); + dma_sync_size = min(dma_sync_size, pool->p.max_len); - dma_sync_single_range_for_device(pool->p.dev, page->dma_addr, + dma_sync_single_range_for_device(pool->p.dev, dma_addr, pool->p.offset, dma_sync_size, pool->p.dma_dir); } @@ -195,7 +197,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) if (dma_mapping_error(pool->p.dev, dma)) return false; - page->dma_addr = dma; + page_pool_set_dma_addr(page, dma); if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) page_pool_dma_sync_for_device(pool, page, pool->p.max_len); @@ -331,13 +333,13 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) */ goto skip_dma_unmap; - dma = page->dma_addr; + dma = page_pool_get_dma_addr(page); - /* When page is unmapped, it cannot be returned our pool */ + /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); - page->dma_addr = 0; + page_pool_set_dma_addr(page, 0); skip_dma_unmap: /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. From f649dc0e0d7b509c75570ee403723660f5b72ec7 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 14 May 2021 17:27:27 -0700 Subject: [PATCH 257/263] kasan: fix unit tests with CONFIG_UBSAN_LOCAL_BOUNDS enabled These tests deliberately access these arrays out of bounds, which will cause the dynamic local bounds checks inserted by CONFIG_UBSAN_LOCAL_BOUNDS to fail and panic the kernel. To avoid this problem, access the arrays via volatile pointers, which will prevent the compiler from being able to determine the array bounds. These accesses use volatile pointers to char (char *volatile) rather than the more conventional pointers to volatile char (volatile char *) because we want to prevent the compiler from making inferences about the pointer itself (i.e. its array bounds), not the data that it refers to. Link: https://lkml.kernel.org/r/20210507025915.1464056-1-pcc@google.com Link: https://linux-review.googlesource.com/id/I90b1713fbfa1bf68ff895aef099ea77b98a7c3b9 Signed-off-by: Peter Collingbourne Tested-by: Alexander Potapenko Reviewed-by: Andrey Konovalov Cc: Peter Collingbourne Cc: George Popescu Cc: Elena Petrova Cc: Evgenii Stepanov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index dc05cfc2d12f0..cacbbbdef768d 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -654,8 +654,20 @@ static char global_array[10]; static void kasan_global_oob(struct kunit *test) { - volatile int i = 3; - char *p = &global_array[ARRAY_SIZE(global_array) + i]; + /* + * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS + * from failing here and panicing the kernel, access the array via a + * volatile pointer, which will prevent the compiler from being able to + * determine the array bounds. + * + * This access uses a volatile pointer to char (char *volatile) rather + * than the more conventional pointer to volatile char (volatile char *) + * because we want to prevent the compiler from making inferences about + * the pointer itself (i.e. its array bounds), not the data that it + * refers to. + */ + char *volatile array = global_array; + char *p = &array[ARRAY_SIZE(global_array) + 3]; /* Only generic mode instruments globals. */ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); @@ -703,8 +715,9 @@ static void ksize_uaf(struct kunit *test) static void kasan_stack_oob(struct kunit *test) { char stack_array[10]; - volatile int i = OOB_TAG_OFF; - char *p = &stack_array[ARRAY_SIZE(stack_array) + i]; + /* See comment in kasan_global_oob. */ + char *volatile array = stack_array; + char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF]; KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); @@ -715,7 +728,9 @@ static void kasan_alloca_oob_left(struct kunit *test) { volatile int i = 10; char alloca_array[i]; - char *p = alloca_array - 1; + /* See comment in kasan_global_oob. */ + char *volatile array = alloca_array; + char *p = array - 1; /* Only generic mode instruments dynamic allocas. */ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); @@ -728,7 +743,9 @@ static void kasan_alloca_oob_right(struct kunit *test) { volatile int i = 10; char alloca_array[i]; - char *p = alloca_array + i; + /* See comment in kasan_global_oob. */ + char *volatile array = alloca_array; + char *p = array + i; /* Only generic mode instruments dynamic allocas. */ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); From 076171a67789ad0107de44c2964f2e46a7d0d7b8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 May 2021 17:27:30 -0700 Subject: [PATCH 258/263] mm/filemap: fix readahead return types A readahead request will not allocate more memory than can be represented by a size_t, even on systems that have HIGHMEM available. Change the length functions from returning an loff_t to a size_t. Link: https://lkml.kernel.org/r/20210510201201.1558972-1-willy@infradead.org Fixes: 32c0a6bcaa1f57 ("btrfs: add and use readahead_batch_length") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Reported-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/iomap/buffered-io.c | 4 ++-- include/linux/pagemap.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f2cd2034a87bb..9023717c5188b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -394,7 +394,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { struct inode *inode = rac->mapping->host; loff_t pos = readahead_pos(rac); - loff_t length = readahead_length(rac); + size_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { .rac = rac, }; @@ -402,7 +402,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - loff_t ret = iomap_apply(inode, pos, length, 0, ops, + ssize_t ret = iomap_apply(inode, pos, length, 0, ops, &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a4bd41128bf31..e89df447fae32 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -997,9 +997,9 @@ static inline loff_t readahead_pos(struct readahead_control *rac) * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ -static inline loff_t readahead_length(struct readahead_control *rac) +static inline size_t readahead_length(struct readahead_control *rac) { - return (loff_t)rac->_nr_pages * PAGE_SIZE; + return rac->_nr_pages * PAGE_SIZE; } /** @@ -1024,7 +1024,7 @@ static inline unsigned int readahead_count(struct readahead_control *rac) * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ -static inline loff_t readahead_batch_length(struct readahead_control *rac) +static inline size_t readahead_batch_length(struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } From c3187cf32216313fb316084efac4dab3a8459b1d Mon Sep 17 00:00:00 2001 From: Jouni Roivas Date: Fri, 14 May 2021 17:27:33 -0700 Subject: [PATCH 259/263] hfsplus: prevent corruption in shrinking truncate I believe there are some issues introduced by commit 31651c607151 ("hfsplus: avoid deadlock on file truncation") HFS+ has extent records which always contains 8 extents. In case the first extent record in catalog file gets full, new ones are allocated from extents overflow file. In case shrinking truncate happens to middle of an extent record which locates in extents overflow file, the logic in hfsplus_file_truncate() was changed so that call to hfs_brec_remove() is not guarded any more. Right action would be just freeing the extents that exceed the new size inside extent record by calling hfsplus_free_extents(), and then check if the whole extent record should be removed. However since the guard (blk_cnt > start) is now after the call to hfs_brec_remove(), this has unfortunate effect that the last matching extent record is removed unconditionally. To reproduce this issue, create a file which has at least 10 extents, and then perform shrinking truncate into middle of the last extent record, so that the number of remaining extents is not under or divisible by 8. This causes the last extent record (8 extents) to be removed totally instead of truncating into middle of it. Thus this causes corruption, and lost data. Fix for this is simply checking if the new truncated end is below the start of this extent record, making it safe to remove the full extent record. However call to hfs_brec_remove() can't be moved to it's previous place since we're dropping ->tree_lock and it can cause a race condition and the cached info being invalidated possibly corrupting the node data. Another issue is related to this one. When entering into the block (blk_cnt > start) we are not holding the ->tree_lock. We break out from the loop not holding the lock, but hfs_find_exit() does unlock it. Not sure if it's possible for someone else to take the lock under our feet, but it can cause hard to debug errors and premature unlocking. Even if there's no real risk of it, the locking should still always be kept in balance. Thus taking the lock now just before the check. Link: https://lkml.kernel.org/r/20210429165139.3082828-1-jouni.roivas@tuxera.com Fixes: 31651c607151f ("hfsplus: avoid deadlock on file truncation") Signed-off-by: Jouni Roivas Reviewed-by: Anton Altaparmakov Cc: Anatoly Trosinenko Cc: Viacheslav Dubeyko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/extents.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a930ddd156819..7054a542689f9 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -598,13 +598,15 @@ void hfsplus_file_truncate(struct inode *inode) res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; - hfs_brec_remove(&fd); - mutex_unlock(&fd.tree->tree_lock); start = hip->cached_start; + if (blk_cnt <= start) + hfs_brec_remove(&fd); + mutex_unlock(&fd.tree->tree_lock); hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->cached_extents); + mutex_lock(&fd.tree->tree_lock); if (blk_cnt > start) { hip->extent_state |= HFSPLUS_EXT_DIRTY; break; @@ -612,7 +614,6 @@ void hfsplus_file_truncate(struct inode *inode) alloc_cnt = start; hip->cached_start = hip->cached_blocks = 0; hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); - mutex_lock(&fd.tree->tree_lock); } hfs_find_exit(&fd); From f4d3f25aced3b493e57fd4109e2bc86f0831b23e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 14 May 2021 17:27:36 -0700 Subject: [PATCH 260/263] docs: admin-guide: update description for kernel.modprobe sysctl When I added CONFIG_MODPROBE_PATH, I neglected to update Documentation/. It's still true that this defaults to /sbin/modprobe, but now via a level of indirection. So document that the kernel might have been built with something other than /sbin/modprobe as the initial value. Link: https://lkml.kernel.org/r/20210420125324.1246826-1-linux@rasmusvillemoes.dk Fixes: 17652f4240f7a ("modules: add CONFIG_MODPROBE_PATH") Signed-off-by: Rasmus Villemoes Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Jessica Yu Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1d56a6b73a4e9..7ca8df5451d45 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -483,10 +483,11 @@ modprobe ======== The full path to the usermode helper for autoloading kernel modules, -by default "/sbin/modprobe". This binary is executed when the kernel -requests a module. For example, if userspace passes an unknown -filesystem type to mount(), then the kernel will automatically request -the corresponding filesystem module by executing this usermode helper. +by default ``CONFIG_MODPROBE_PATH``, which in turn defaults to +"/sbin/modprobe". This binary is executed when the kernel requests a +module. For example, if userspace passes an unknown filesystem type +to mount(), then the kernel will automatically request the +corresponding filesystem module by executing this usermode helper. This usermode helper should insert the needed module into the kernel. This sysctl only affects module autoloading. It has no effect on the From 86d0c164272536c732853e19391de5159f860701 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 May 2021 17:27:39 -0700 Subject: [PATCH 261/263] mm/ioremap: fix iomap_max_page_shift iomap_max_page_shift is expected to contain a page shift, so it can't be a 'bool', has to be an 'unsigned int' And fix the default values: P4D_SHIFT is when huge iomap is allowed. However, on some architectures (eg: powerpc book3s/64), P4D_SHIFT is not a constant so it can't be used to initialise a static variable. So, initialise iomap_max_page_shift with a maximum shift supported by the architecture, it is gated by P4D_SHIFT in vmap_try_huge_p4d() anyway. Link: https://lkml.kernel.org/r/ad2d366015794a9f21320dcbdd0a8eb98979e9df.1620898113.git.christophe.leroy@csgroup.eu Fixes: bbc180a5adb0 ("mm: HUGE_VMAP arch support cleanup") Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ioremap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/ioremap.c b/mm/ioremap.c index d1dcc7e744acf..8ee0136f8cb08 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -16,16 +16,16 @@ #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT; +static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1; static int __init set_nohugeiomap(char *str) { - iomap_max_page_shift = P4D_SHIFT; + iomap_max_page_shift = PAGE_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static const bool iomap_max_page_shift = PAGE_SHIFT; +static const unsigned int iomap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ int ioremap_page_range(unsigned long addr, From ffb324e6f874121f7dce5bdae5e05d02baae7269 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 15 May 2021 03:00:37 +0000 Subject: [PATCH 262/263] tty: vt: always invoke vc->vc_sw->con_resize callback syzbot is reporting OOB write at vga16fb_imageblit() [1], for resize_screen() from ioctl(VT_RESIZE) returns 0 without checking whether requested rows/columns fit the amount of memory reserved for the graphical screen if current mode is KD_GRAPHICS. ---------- #include #include #include #include #include #include int main(int argc, char *argv[]) { const int fd = open("/dev/char/4:1", O_RDWR); struct vt_sizes vt = { 0x4100, 2 }; ioctl(fd, KDSETMODE, KD_GRAPHICS); ioctl(fd, VT_RESIZE, &vt); ioctl(fd, KDSETMODE, KD_TEXT); return 0; } ---------- Allow framebuffer drivers to return -EINVAL, by moving vc->vc_mode != KD_GRAPHICS check from resize_screen() to fbcon_resize(). Link: https://syzkaller.appspot.com/bug?extid=1f29e126cf461c4de3b3 [1] Reported-by: syzbot Suggested-by: Linus Torvalds Signed-off-by: Tetsuo Handa Tested-by: syzbot Signed-off-by: Linus Torvalds --- drivers/tty/vt/vt.c | 2 +- drivers/video/fbdev/core/fbcon.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 01645e87b3d5c..fa1548d4f94be 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -1171,7 +1171,7 @@ static inline int resize_screen(struct vc_data *vc, int width, int height, /* Resizes the resolution of the display adapater */ int err = 0; - if (vc->vc_mode != KD_GRAPHICS && vc->vc_sw->con_resize) + if (vc->vc_sw->con_resize) err = vc->vc_sw->con_resize(vc, width, height, user); return err; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 3406067985b1f..22bb3892f6bd1 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2019,7 +2019,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, return -EINVAL; pr_debug("resize now %ix%i\n", var.xres, var.yres); - if (con_is_visible(vc)) { + if (con_is_visible(vc) && vc->vc_mode == KD_TEXT) { var.activate = FB_ACTIVATE_NOW | FB_ACTIVATE_FORCE; fb_set_var(info, &var); From d07f6ca923ea0927a1024dfccafc5b53b61cfecc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 May 2021 15:27:44 -0700 Subject: [PATCH 263/263] Linux 5.13-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 53d09c414635c..0ed7e061c8e9e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Frozen Wasteland # *DOCUMENTATION*