From d525a5b8cf39791b47a3d61b36dcb43e1d6fbde8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:11:57 -0300 Subject: [PATCH 01/26] iommufd: Move isolated msi enforcement to iommufd_device_bind() With the recent rework this no longer needs to be done at domain attachment time, we know if the device is usable by iommufd when we bind it. The value of msi_device_has_isolated_msi() is not allowed to change while a driver is bound. Link: https://lore.kernel.org/r/1-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 38 ++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 59fec5783eb94..d4cc394263097 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -60,6 +60,26 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, if (!group) return ERR_PTR(-ENODEV); + /* + * For historical compat with VFIO the insecure interrupt path is + * allowed if the module parameter is set. Secure/Isolated means that a + * MemWr operation from the device (eg a simple DMA) cannot trigger an + * interrupt outside this iommufd context. + */ + if (!iommufd_selftest_is_mock_dev(dev) && + !iommu_group_has_isolated_msi(group)) { + if (!allow_unsafe_interrupts) { + rc = -EPERM; + goto out_group_put; + } + + dev_warn( + dev, + "MSI interrupts are not secure, they cannot be isolated by the platform. " + "Check that platform features like interrupt remapping are enabled. " + "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); + } + rc = iommu_device_claim_dma_owner(dev, ictx); if (rc) goto out_group_put; @@ -188,24 +208,6 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, */ hwpt->msi_cookie = true; } - - /* - * For historical compat with VFIO the insecure interrupt path is - * allowed if the module parameter is set. Insecure means that a MemWr - * operation from the device (eg a simple DMA) cannot trigger an - * interrupt outside this iommufd context. - */ - if (!iommufd_selftest_is_mock_dev(idev->dev) && - !iommu_group_has_isolated_msi(idev->group)) { - if (!allow_unsafe_interrupts) - return -EPERM; - - dev_warn( - idev->dev, - "MSI interrupts are not secure, they cannot be isolated by the platform. " - "Check that platform features like interrupt remapping are enabled. " - "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); - } return 0; } From 3a3329a7f14a7a0a8c30a12c7ed9f1f77f8efaa1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:11:58 -0300 Subject: [PATCH 02/26] iommufd: Add iommufd_group When the hwpt to device attachment is fairly static we could get away with the simple approach of keeping track of the groups via a device list. But with replace this is infeasible. Add an automatically managed struct that is 1:1 with the iommu_group per-ictx so we can store the necessary tracking information there. Link: https://lore.kernel.org/r/2-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 144 +++++++++++++++++++++--- drivers/iommu/iommufd/iommufd_private.h | 9 +- drivers/iommu/iommufd/main.c | 2 + 3 files changed, 137 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index d4cc394263097..dbd517d97df8a 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -15,13 +15,121 @@ MODULE_PARM_DESC( "Allow IOMMUFD to bind to devices even if the platform cannot isolate " "the MSI interrupt window. Enabling this is a security weakness."); +static void iommufd_group_release(struct kref *kref) +{ + struct iommufd_group *igroup = + container_of(kref, struct iommufd_group, ref); + + xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, + NULL, GFP_KERNEL); + iommu_group_put(igroup->group); + kfree(igroup); +} + +static void iommufd_put_group(struct iommufd_group *group) +{ + kref_put(&group->ref, iommufd_group_release); +} + +static bool iommufd_group_try_get(struct iommufd_group *igroup, + struct iommu_group *group) +{ + if (!igroup) + return false; + /* + * group ID's cannot be re-used until the group is put back which does + * not happen if we could get an igroup pointer under the xa_lock. + */ + if (WARN_ON(igroup->group != group)) + return false; + return kref_get_unless_zero(&igroup->ref); +} + +/* + * iommufd needs to store some more data for each iommu_group, we keep a + * parallel xarray indexed by iommu_group id to hold this instead of putting it + * in the core structure. To keep things simple the iommufd_group memory is + * unique within the iommufd_ctx. This makes it easy to check there are no + * memory leaks. + */ +static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, + struct device *dev) +{ + struct iommufd_group *new_igroup; + struct iommufd_group *cur_igroup; + struct iommufd_group *igroup; + struct iommu_group *group; + unsigned int id; + + group = iommu_group_get(dev); + if (!group) + return ERR_PTR(-ENODEV); + + id = iommu_group_id(group); + + xa_lock(&ictx->groups); + igroup = xa_load(&ictx->groups, id); + if (iommufd_group_try_get(igroup, group)) { + xa_unlock(&ictx->groups); + iommu_group_put(group); + return igroup; + } + xa_unlock(&ictx->groups); + + new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL); + if (!new_igroup) { + iommu_group_put(group); + return ERR_PTR(-ENOMEM); + } + + kref_init(&new_igroup->ref); + /* group reference moves into new_igroup */ + new_igroup->group = group; + + /* + * The ictx is not additionally refcounted here becase all objects using + * an igroup must put it before their destroy completes. + */ + new_igroup->ictx = ictx; + + /* + * We dropped the lock so igroup is invalid. NULL is a safe and likely + * value to assume for the xa_cmpxchg algorithm. + */ + cur_igroup = NULL; + xa_lock(&ictx->groups); + while (true) { + igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup, + GFP_KERNEL); + if (xa_is_err(igroup)) { + xa_unlock(&ictx->groups); + iommufd_put_group(new_igroup); + return ERR_PTR(xa_err(igroup)); + } + + /* new_group was successfully installed */ + if (cur_igroup == igroup) { + xa_unlock(&ictx->groups); + return new_igroup; + } + + /* Check again if the current group is any good */ + if (iommufd_group_try_get(igroup, group)) { + xa_unlock(&ictx->groups); + iommufd_put_group(new_igroup); + return igroup; + } + cur_igroup = igroup; + } +} + void iommufd_device_destroy(struct iommufd_object *obj) { struct iommufd_device *idev = container_of(obj, struct iommufd_device, obj); iommu_device_release_dma_owner(idev->dev); - iommu_group_put(idev->group); + iommufd_put_group(idev->igroup); if (!iommufd_selftest_is_mock_dev(idev->dev)) iommufd_ctx_put(idev->ictx); } @@ -46,7 +154,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id) { struct iommufd_device *idev; - struct iommu_group *group; + struct iommufd_group *igroup; int rc; /* @@ -56,9 +164,9 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) return ERR_PTR(-EINVAL); - group = iommu_group_get(dev); - if (!group) - return ERR_PTR(-ENODEV); + igroup = iommufd_get_group(ictx, dev); + if (IS_ERR(igroup)) + return ERR_CAST(igroup); /* * For historical compat with VFIO the insecure interrupt path is @@ -67,7 +175,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, * interrupt outside this iommufd context. */ if (!iommufd_selftest_is_mock_dev(dev) && - !iommu_group_has_isolated_msi(group)) { + !iommu_group_has_isolated_msi(igroup->group)) { if (!allow_unsafe_interrupts) { rc = -EPERM; goto out_group_put; @@ -97,8 +205,8 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY); /* The calling driver is a user until iommufd_device_unbind() */ refcount_inc(&idev->obj.users); - /* group refcount moves into iommufd_device */ - idev->group = group; + /* igroup refcount moves into iommufd_device */ + idev->igroup = igroup; /* * If the caller fails after this success it must call @@ -113,7 +221,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, out_release_owner: iommu_device_release_dma_owner(dev); out_group_put: - iommu_group_put(group); + iommufd_put_group(igroup); return ERR_PTR(rc); } EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); @@ -138,7 +246,8 @@ bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group) xa_lock(&ictx->objects); xa_for_each(&ictx->objects, index, obj) { if (obj->type == IOMMUFD_OBJ_DEVICE && - container_of(obj, struct iommufd_device, obj)->group == group) { + container_of(obj, struct iommufd_device, obj) + ->igroup->group == group) { xa_unlock(&ictx->objects); return true; } @@ -212,14 +321,14 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, } static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt, - struct iommu_group *group) + struct iommufd_group *igroup) { struct iommufd_device *cur_dev; lockdep_assert_held(&hwpt->devices_lock); list_for_each_entry(cur_dev, &hwpt->devices, devices_item) - if (cur_dev->group == group) + if (cur_dev->igroup->group == igroup->group) return true; return false; } @@ -253,7 +362,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, } rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev, - idev->group, &sw_msi_start); + idev->igroup->group, + &sw_msi_start); if (rc) return rc; @@ -265,8 +375,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * FIXME: Hack around missing a device-centric iommu api, only attach to * the group once for the first device that is in the group. */ - if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) { - rc = iommu_attach_group(hwpt->domain, idev->group); + if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) { + rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; } @@ -279,8 +389,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { - if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) - iommu_detach_group(hwpt->domain, idev->group); + if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) + iommu_detach_group(hwpt->domain, idev->igroup->group); iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 3dcaf86aab97b..285ac4db8b45b 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -17,6 +17,7 @@ struct iommufd_device; struct iommufd_ctx { struct file *file; struct xarray objects; + struct xarray groups; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -262,6 +263,12 @@ void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); +struct iommufd_group { + struct kref ref; + struct iommufd_ctx *ictx; + struct iommu_group *group; +}; + /* * A iommufd_device object represents the binding relationship between a * consuming driver and the iommufd. These objects are created/destroyed by @@ -270,12 +277,12 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); struct iommufd_device { struct iommufd_object obj; struct iommufd_ctx *ictx; + struct iommufd_group *igroup; struct iommufd_hw_pagetable *hwpt; /* Head at iommufd_hw_pagetable::devices */ struct list_head devices_item; /* always the physical device */ struct device *dev; - struct iommu_group *group; bool enforce_cache_coherency; }; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 4bbb20dff4300..34fefc0a78336 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -183,6 +183,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) } xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); + xa_init(&ictx->groups); ictx->file = filp; filp->private_data = ictx; return 0; @@ -218,6 +219,7 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) if (WARN_ON(!destroyed)) break; } + WARN_ON(!xa_empty(&ictx->groups)); kfree(ictx); return 0; } From 91a2e17e243f70e0aec29facf44946439fce9195 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:11:59 -0300 Subject: [PATCH 03/26] iommufd: Replace the hwpt->devices list with iommufd_group The devices list was used as a simple way to avoid having per-group information. Now that this seems to be unavoidable, just commit to per-group information fully and remove the devices list from the HWPT. The iommufd_group stores the currently assigned HWPT for the entire group and we can manage the per-device attach/detach with a list in the iommufd_group. For destruction the flow is organized to make the following patches easier, the actual call to iommufd_object_destroy_user() is done at the top of the call chain without holding any locks. The HWPT to be destroyed is returned out from the locked region to make this possible. Later patches create locking that requires this. Link: https://lore.kernel.org/r/3-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 100 +++++++++++------------- drivers/iommu/iommufd/hw_pagetable.c | 22 +----- drivers/iommu/iommufd/iommufd_private.h | 13 ++- 3 files changed, 54 insertions(+), 81 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index dbd517d97df8a..dbbfd50c0fce7 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -20,9 +20,12 @@ static void iommufd_group_release(struct kref *kref) struct iommufd_group *igroup = container_of(kref, struct iommufd_group, ref); + WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list)); + xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, NULL, GFP_KERNEL); iommu_group_put(igroup->group); + mutex_destroy(&igroup->lock); kfree(igroup); } @@ -83,6 +86,8 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, } kref_init(&new_igroup->ref); + mutex_init(&new_igroup->lock); + INIT_LIST_HEAD(&new_igroup->device_list); /* group reference moves into new_igroup */ new_igroup->group = group; @@ -320,29 +325,18 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, return 0; } -static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt, - struct iommufd_group *igroup) -{ - struct iommufd_device *cur_dev; - - lockdep_assert_held(&hwpt->devices_lock); - - list_for_each_entry(cur_dev, &hwpt->devices, devices_item) - if (cur_dev->igroup->group == igroup->group) - return true; - return false; -} - int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { phys_addr_t sw_msi_start = PHYS_ADDR_MAX; int rc; - lockdep_assert_held(&hwpt->devices_lock); + mutex_lock(&idev->igroup->lock); - if (WARN_ON(idev->hwpt)) - return -EINVAL; + if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) { + rc = -EINVAL; + goto err_unlock; + } /* * Try to upgrade the domain we have, it is an iommu driver bug to @@ -356,8 +350,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, hwpt->domain->ops->enforce_cache_coherency( hwpt->domain); if (!hwpt->enforce_cache_coherency) { - WARN_ON(list_empty(&hwpt->devices)); - return -EINVAL; + WARN_ON(list_empty(&idev->igroup->device_list)); + rc = -EINVAL; + goto err_unlock; } } @@ -365,51 +360,52 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, idev->igroup->group, &sw_msi_start); if (rc) - return rc; + goto err_unlock; rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); if (rc) goto err_unresv; /* - * FIXME: Hack around missing a device-centric iommu api, only attach to - * the group once for the first device that is in the group. + * Only attach to the group once for the first device that is in the + * group. All the other devices will follow this attachment. The user + * should attach every device individually to the hwpt as the per-device + * reserved regions are only updated during individual device + * attachment. */ - if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) { + if (list_empty(&idev->igroup->device_list)) { rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; + idev->igroup->hwpt = hwpt; } + refcount_inc(&hwpt->obj.users); + list_add_tail(&idev->group_item, &idev->igroup->device_list); + mutex_unlock(&idev->igroup->lock); return 0; err_unresv: iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); +err_unlock: + mutex_unlock(&idev->igroup->lock); return rc; } -void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_detach(struct iommufd_device *idev) { - if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) + struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; + + mutex_lock(&idev->igroup->lock); + list_del(&idev->group_item); + if (list_empty(&idev->igroup->device_list)) { iommu_detach_group(hwpt->domain, idev->igroup->group); + idev->igroup->hwpt = NULL; + } iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); -} - -static int iommufd_device_do_attach(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt) -{ - int rc; - - mutex_lock(&hwpt->devices_lock); - rc = iommufd_hw_pagetable_attach(hwpt, idev); - if (rc) - goto out_unlock; + mutex_unlock(&idev->igroup->lock); - idev->hwpt = hwpt; - refcount_inc(&hwpt->obj.users); - list_add(&idev->devices_item, &hwpt->devices); -out_unlock: - mutex_unlock(&hwpt->devices_lock); - return rc; + /* Caller must destroy hwpt */ + return hwpt; } /* @@ -418,7 +414,7 @@ static int iommufd_device_do_attach(struct iommufd_device *idev, * Automatic domain selection will never pick a manually created domain. */ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, - struct iommufd_ioas *ioas) + struct iommufd_ioas *ioas, u32 *pt_id) { struct iommufd_hw_pagetable *hwpt; int rc; @@ -435,7 +431,7 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, if (!iommufd_lock_obj(&hwpt->obj)) continue; - rc = iommufd_device_do_attach(idev, hwpt); + rc = iommufd_hw_pagetable_attach(hwpt, idev); iommufd_put_object(&hwpt->obj); /* @@ -445,6 +441,7 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, */ if (rc == -EINVAL) continue; + *pt_id = hwpt->obj.id; goto out_unlock; } @@ -454,6 +451,7 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } hwpt->auto_domain = true; + *pt_id = hwpt->obj.id; mutex_unlock(&ioas->mutex); iommufd_object_finalize(idev->ictx, &hwpt->obj); @@ -489,7 +487,7 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); - rc = iommufd_device_do_attach(idev, hwpt); + rc = iommufd_hw_pagetable_attach(hwpt, idev); if (rc) goto out_put_pt_obj; break; @@ -498,7 +496,7 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) struct iommufd_ioas *ioas = container_of(pt_obj, struct iommufd_ioas, obj); - rc = iommufd_device_auto_get_domain(idev, ioas); + rc = iommufd_device_auto_get_domain(idev, ioas, pt_id); if (rc) goto out_put_pt_obj; break; @@ -509,7 +507,6 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) } refcount_inc(&idev->obj.users); - *pt_id = idev->hwpt->obj.id; rc = 0; out_put_pt_obj: @@ -527,14 +524,9 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); */ void iommufd_device_detach(struct iommufd_device *idev) { - struct iommufd_hw_pagetable *hwpt = idev->hwpt; - - mutex_lock(&hwpt->devices_lock); - list_del(&idev->devices_item); - idev->hwpt = NULL; - iommufd_hw_pagetable_detach(hwpt, idev); - mutex_unlock(&hwpt->devices_lock); + struct iommufd_hw_pagetable *hwpt; + hwpt = iommufd_hw_pagetable_detach(idev); if (hwpt->auto_domain) iommufd_object_destroy_user(idev->ictx, &hwpt->obj); else diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 6cdb6749d359f..bdb76cdb1dc34 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -11,8 +11,6 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) struct iommufd_hw_pagetable *hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); - WARN_ON(!list_empty(&hwpt->devices)); - if (!list_empty(&hwpt->hwpt_item)) { mutex_lock(&hwpt->ioas->mutex); list_del(&hwpt->hwpt_item); @@ -25,7 +23,6 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) iommu_domain_free(hwpt->domain); refcount_dec(&hwpt->ioas->obj.users); - mutex_destroy(&hwpt->devices_lock); } /** @@ -52,9 +49,7 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (IS_ERR(hwpt)) return hwpt; - INIT_LIST_HEAD(&hwpt->devices); INIT_LIST_HEAD(&hwpt->hwpt_item); - mutex_init(&hwpt->devices_lock); /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); hwpt->ioas = ioas; @@ -65,8 +60,6 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } - mutex_lock(&hwpt->devices_lock); - /* * immediate_attach exists only to accommodate iommu drivers that cannot * directly allocate a domain. These drivers do not finish creating the @@ -76,29 +69,18 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (immediate_attach) { rc = iommufd_hw_pagetable_attach(hwpt, idev); if (rc) - goto out_unlock; + goto out_abort; } rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); if (rc) goto out_detach; list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); - - if (immediate_attach) { - /* See iommufd_device_do_attach() */ - refcount_inc(&hwpt->obj.users); - idev->hwpt = hwpt; - list_add(&idev->devices_item, &hwpt->devices); - } - - mutex_unlock(&hwpt->devices_lock); return hwpt; out_detach: if (immediate_attach) - iommufd_hw_pagetable_detach(hwpt, idev); -out_unlock: - mutex_unlock(&hwpt->devices_lock); + iommufd_hw_pagetable_detach(idev); out_abort: iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 285ac4db8b45b..2d3fd26729392 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -250,8 +250,6 @@ struct iommufd_hw_pagetable { bool msi_cookie : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; - struct mutex devices_lock; - struct list_head devices; }; struct iommufd_hw_pagetable * @@ -259,14 +257,17 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, bool immediate_attach); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); -void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); struct iommufd_group { struct kref ref; + struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; + struct iommufd_hw_pagetable *hwpt; + struct list_head device_list; }; /* @@ -278,9 +279,7 @@ struct iommufd_device { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_group *igroup; - struct iommufd_hw_pagetable *hwpt; - /* Head at iommufd_hw_pagetable::devices */ - struct list_head devices_item; + struct list_head group_item; /* always the physical device */ struct device *dev; bool enforce_cache_coherency; From 8d0e2e9d93d2b25b62c7fb6faf8c3cc31c6c6626 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:00 -0300 Subject: [PATCH 04/26] iommu: Export iommu_get_resv_regions() iommufd wants to use this in the next patch. For some reason the iommu_put_resv_regions() was already exported. Link: https://lore.kernel.org/r/4-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index caaf563d38ae0..b0804fbad6f9d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2642,6 +2642,14 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks); +/** + * iommu_get_resv_regions - get reserved regions + * @dev: device for which to get reserved regions + * @list: reserved region list for device + * + * This returns a list of reserved IOVA regions specific to this device. + * A domain user should not map IOVA in these ranges. + */ void iommu_get_resv_regions(struct device *dev, struct list_head *list) { const struct iommu_ops *ops = dev_iommu_ops(dev); @@ -2649,9 +2657,10 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list) if (ops->get_resv_regions) ops->get_resv_regions(dev, list); } +EXPORT_SYMBOL_GPL(iommu_get_resv_regions); /** - * iommu_put_resv_regions - release resered regions + * iommu_put_resv_regions - release reserved regions * @dev: device for which to free reserved regions * @list: reserved region list for device * From 34f327a985ff11e538aa52b3e4842328815ce5de Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:01 -0300 Subject: [PATCH 05/26] iommufd: Keep track of each device's reserved regions instead of groups The driver facing API in the iommu core makes the reserved regions per-device. An algorithm in the core code consolidates the regions of all the devices in a group to return the group view. To allow for devices to be hotplugged into the group iommufd would re-load the entire group's reserved regions for each device, just in case they changed. Further iommufd already has to deal with duplicated/overlapping reserved regions as it must union all the groups together. Thus simplify all of this to just use the device reserved regions interface directly from the iommu driver. Link: https://lore.kernel.org/r/5-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Suggested-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 5 ++--- drivers/iommu/iommufd/io_pagetable.c | 27 ++++++++++--------------- drivers/iommu/iommufd/iommufd_private.h | 7 +++---- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index dbbfd50c0fce7..7fd5c184b5607 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -356,9 +356,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, } } - rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev, - idev->igroup->group, - &sw_msi_start); + rc = iopt_table_enforce_dev_resv_regions( + &hwpt->ioas->iopt, idev->dev, &sw_msi_start); if (rc) goto err_unlock; diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 724c4c5742417..112bea5a84e43 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1169,25 +1169,22 @@ void iopt_remove_access(struct io_pagetable *iopt, up_write(&iopt->domains_rwsem); } -/* Narrow the valid_iova_itree to include reserved ranges from a group. */ -int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, - struct device *device, - struct iommu_group *group, - phys_addr_t *sw_msi_start) +/* Narrow the valid_iova_itree to include reserved ranges from a device. */ +int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, + struct device *dev, + phys_addr_t *sw_msi_start) { struct iommu_resv_region *resv; - struct iommu_resv_region *tmp; - LIST_HEAD(group_resv_regions); + LIST_HEAD(resv_regions); unsigned int num_hw_msi = 0; unsigned int num_sw_msi = 0; int rc; down_write(&iopt->iova_rwsem); - rc = iommu_get_group_resv_regions(group, &group_resv_regions); - if (rc) - goto out_unlock; + /* FIXME: drivers allocate memory but there is no failure propogated */ + iommu_get_resv_regions(dev, &resv_regions); - list_for_each_entry(resv, &group_resv_regions, list) { + list_for_each_entry(resv, &resv_regions, list) { if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) continue; @@ -1199,7 +1196,7 @@ int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, } rc = iopt_reserve_iova(iopt, resv->start, - resv->length - 1 + resv->start, device); + resv->length - 1 + resv->start, dev); if (rc) goto out_reserved; } @@ -1214,11 +1211,9 @@ int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, goto out_free_resv; out_reserved: - __iopt_remove_reserved_iova(iopt, device); + __iopt_remove_reserved_iova(iopt, dev); out_free_resv: - list_for_each_entry_safe(resv, tmp, &group_resv_regions, list) - kfree(resv); -out_unlock: + iommu_put_resv_regions(dev, &resv_regions); up_write(&iopt->iova_rwsem); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 2d3fd26729392..206c0212ebfdb 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -76,10 +76,9 @@ int iopt_table_add_domain(struct io_pagetable *iopt, struct iommu_domain *domain); void iopt_table_remove_domain(struct io_pagetable *iopt, struct iommu_domain *domain); -int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, - struct device *device, - struct iommu_group *group, - phys_addr_t *sw_msi_start); +int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, + struct device *dev, + phys_addr_t *sw_msi_start); int iopt_set_allow_iova(struct io_pagetable *iopt, struct rb_root_cached *allowed_iova); int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, From 269c5238c5b134ca8b18b98814b87ec995a59968 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:02 -0300 Subject: [PATCH 06/26] iommufd: Use the iommufd_group to avoid duplicate MSI setup This only needs to be done once per group, not once per device. The once per device was a way to make the device list work. Since we are abandoning this we can optimize things a bit. Link: https://lore.kernel.org/r/6-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 7fd5c184b5607..eb90d6f9e10f5 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -361,10 +361,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, if (rc) goto err_unlock; - rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); - if (rc) - goto err_unresv; - /* * Only attach to the group once for the first device that is in the * group. All the other devices will follow this attachment. The user @@ -373,6 +369,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. */ if (list_empty(&idev->igroup->device_list)) { + rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); + if (rc) + goto err_unresv; + rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; From 1d149ab2e0062a98b3676d6ee0aaa34be16f4d4f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:03 -0300 Subject: [PATCH 07/26] iommufd: Make sw_msi_start a group global The sw_msi_start is only set by the ARM drivers and it is always constant. Due to the way vfio/iommufd allow domains to be re-used between devices we have a built in assumption that there is only one value for sw_msi_start and it is global to the system. To make replace simpler where we may not reparse the iommu_get_resv_regions() move the sw_msi_start to the iommufd_group so it is always available once any HWPT has been attached. Link: https://lore.kernel.org/r/7-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 14 +++++++------- drivers/iommu/iommufd/iommufd_private.h | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index eb90d6f9e10f5..4b41ef5292a5c 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -88,6 +88,7 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, kref_init(&new_igroup->ref); mutex_init(&new_igroup->lock); INIT_LIST_HEAD(&new_igroup->device_list); + new_igroup->sw_msi_start = PHYS_ADDR_MAX; /* group reference moves into new_igroup */ new_igroup->group = group; @@ -292,10 +293,10 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); -static int iommufd_device_setup_msi(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - phys_addr_t sw_msi_start) +static int iommufd_group_setup_msi(struct iommufd_group *igroup, + struct iommufd_hw_pagetable *hwpt) { + phys_addr_t sw_msi_start = igroup->sw_msi_start; int rc; /* @@ -328,7 +329,6 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { - phys_addr_t sw_msi_start = PHYS_ADDR_MAX; int rc; mutex_lock(&idev->igroup->lock); @@ -356,8 +356,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, } } - rc = iopt_table_enforce_dev_resv_regions( - &hwpt->ioas->iopt, idev->dev, &sw_msi_start); + rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, + &idev->igroup->sw_msi_start); if (rc) goto err_unlock; @@ -369,7 +369,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. */ if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); + rc = iommufd_group_setup_msi(idev->igroup, hwpt); if (rc) goto err_unresv; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 206c0212ebfdb..6e11d6ef71ee8 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -267,6 +267,7 @@ struct iommufd_group { struct iommu_group *group; struct iommufd_hw_pagetable *hwpt; struct list_head device_list; + phys_addr_t sw_msi_start; }; /* From d03f1336fd91b87ad218dd398265332b1cd2c68c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:04 -0300 Subject: [PATCH 08/26] iommufd: Move putting a hwpt to a helper function Next patch will need to call this from two places. Link: https://lore.kernel.org/r/8-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 6 +----- drivers/iommu/iommufd/iommufd_private.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 4b41ef5292a5c..89cef2d2a2b12 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -526,11 +526,7 @@ void iommufd_device_detach(struct iommufd_device *idev) struct iommufd_hw_pagetable *hwpt; hwpt = iommufd_hw_pagetable_detach(idev); - if (hwpt->auto_domain) - iommufd_object_destroy_user(idev->ictx, &hwpt->obj); - else - refcount_dec(&hwpt->obj.users); - + iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 6e11d6ef71ee8..8bdbef8bd878b 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -260,6 +260,16 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); +static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt) +{ + lockdep_assert_not_held(&hwpt->ioas->mutex); + if (hwpt->auto_domain) + iommufd_object_destroy_user(ictx, &hwpt->obj); + else + refcount_dec(&hwpt->obj.users); +} + struct iommufd_group { struct kref ref; struct mutex lock; From 17bad52708b457c4a0cbd7195ad813d6c4308b82 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:05 -0300 Subject: [PATCH 09/26] iommufd: Add enforced_cache_coherency to iommufd_hw_pagetable_alloc() Logically the HWPT should have the coherency set properly for the device that it is being created for when it is created. This was happening implicitly if the immediate_attach was set because iommufd_hw_pagetable_attach() does it as the first thing. Do it unconditionally so !immediate_attach works properly. Link: https://lore.kernel.org/r/9-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 19 ++++------------- drivers/iommu/iommufd/hw_pagetable.c | 27 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 1 + 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 89cef2d2a2b12..08bba99f2fb01 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -338,22 +338,11 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - /* - * Try to upgrade the domain we have, it is an iommu driver bug to - * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail - * enforce_cache_coherency when there are no devices attached to the - * domain. - */ - if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) { - if (hwpt->domain->ops->enforce_cache_coherency) - hwpt->enforce_cache_coherency = - hwpt->domain->ops->enforce_cache_coherency( - hwpt->domain); - if (!hwpt->enforce_cache_coherency) { - WARN_ON(list_empty(&idev->igroup->device_list)); - rc = -EINVAL; + /* Try to upgrade the domain we have */ + if (idev->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (rc) goto err_unlock; - } } rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index bdb76cdb1dc34..e0699d7f4c64a 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -25,6 +25,20 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) refcount_dec(&hwpt->ioas->obj.users); } +int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) +{ + if (hwpt->enforce_cache_coherency) + return 0; + + if (hwpt->domain->ops->enforce_cache_coherency) + hwpt->enforce_cache_coherency = + hwpt->domain->ops->enforce_cache_coherency( + hwpt->domain); + if (!hwpt->enforce_cache_coherency) + return -EINVAL; + return 0; +} + /** * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device * @ictx: iommufd context @@ -60,6 +74,19 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } + /* + * Set the coherency mode before we do iopt_table_add_domain() as some + * iommus have a per-PTE bit that controls it and need to decide before + * doing any maps. It is an iommu driver bug to report + * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on + * a new domain. + */ + if (idev->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (WARN_ON(rc)) + goto out_abort; + } + /* * immediate_attach exists only to accommodate iommu drivers that cannot * directly allocate a domain. These drivers do not finish creating the diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8bdbef8bd878b..a6210f51b956e 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -254,6 +254,7 @@ struct iommufd_hw_pagetable { struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, bool immediate_attach); +int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * From 70eadc7fc7ef29bfe0e361376983822b5e36dd67 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:06 -0300 Subject: [PATCH 10/26] iommufd: Allow a hwpt to be aborted after allocation During creation the hwpt must have the ioas->mutex held until the object is finalized. This means we need to be able to call iommufd_object_abort_and_destroy() while holding the mutex. Since iommufd_hw_pagetable_destroy() also needs the mutex this is problematic. Fix it by creating a special abort op for the object that can assume the caller is holding the lock, as required by the contract. The next patch will add another iommufd_object_abort_and_destroy() for a hwpt. Fixes: e8d57210035b ("iommufd: Add kAPI toward external drivers for physical devices") Link: https://lore.kernel.org/r/10-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 19 +++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 1 + drivers/iommu/iommufd/main.c | 7 ++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index e0699d7f4c64a..2087b51d9807a 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -25,6 +25,21 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) refcount_dec(&hwpt->ioas->obj.users); } +void iommufd_hw_pagetable_abort(struct iommufd_object *obj) +{ + struct iommufd_hw_pagetable *hwpt = + container_of(obj, struct iommufd_hw_pagetable, obj); + + /* The ioas->mutex must be held until finalize is called. */ + lockdep_assert_held(&hwpt->ioas->mutex); + + if (!list_empty(&hwpt->hwpt_item)) { + list_del_init(&hwpt->hwpt_item); + iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + } + iommufd_hw_pagetable_destroy(obj); +} + int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) { if (hwpt->enforce_cache_coherency) @@ -49,6 +64,10 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT * will be linked to the given ioas and upon return the underlying iommu_domain * is fully popoulated. + * + * The caller must hold the ioas->mutex until after + * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on + * the returned hwpt. */ struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index a6210f51b956e..47fed9feefd04 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -260,6 +260,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); +void iommufd_hw_pagetable_abort(struct iommufd_object *obj); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 34fefc0a78336..438f55e6aa880 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -24,6 +24,7 @@ struct iommufd_object_ops { void (*destroy)(struct iommufd_object *obj); + void (*abort)(struct iommufd_object *obj); }; static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; @@ -95,7 +96,10 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj) { - iommufd_object_ops[obj->type].destroy(obj); + if (iommufd_object_ops[obj->type].abort) + iommufd_object_ops[obj->type].abort(obj); + else + iommufd_object_ops[obj->type].destroy(obj); iommufd_object_abort(ictx, obj); } @@ -425,6 +429,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { }, [IOMMUFD_OBJ_HW_PAGETABLE] = { .destroy = iommufd_hw_pagetable_destroy, + .abort = iommufd_hw_pagetable_abort, }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { From 31422dff187b243c58f3a97d16bbe9e9ada639fe Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:07 -0300 Subject: [PATCH 11/26] iommufd: Fix locking around hwpt allocation Due to the auto_domains mechanism the ioas->mutex must be held until the hwpt is completely setup by iommufd_object_abort_and_destroy() or iommufd_object_finalize(). This prevents a concurrent iommufd_device_auto_get_domain() from seeing an incompletely initialized object through the ioas->hwpt_list. To make this more consistent move the unlock until after finalize. Fixes: e8d57210035b ("iommufd: Add kAPI toward external drivers for physical devices") Link: https://lore.kernel.org/r/11-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 08bba99f2fb01..9b9244c5b0ce4 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -441,8 +441,8 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, hwpt->auto_domain = true; *pt_id = hwpt->obj.id; - mutex_unlock(&ioas->mutex); iommufd_object_finalize(idev->ictx, &hwpt->obj); + mutex_unlock(&ioas->mutex); return 0; out_unlock: mutex_unlock(&ioas->mutex); From ea2d6124b52389aa837403f5debf73620d3f441f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:08 -0300 Subject: [PATCH 12/26] iommufd: Reorganize iommufd_device_attach into iommufd_device_change_pt The code flow for first time attaching a PT and replacing a PT is very similar except for the lowest do_attach step. Reorganize this so that the do_attach step is a function pointer. Replace requires destroying the old HWPT once it is replaced. This destruction cannot be done under all the locks that are held in the function pointer, so the signature allows returning a HWPT which will be destroyed by the caller after everything is unlocked. Link: https://lore.kernel.org/r/12-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 141 ++++++++++++++++++++++++--------- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 9b9244c5b0ce4..138bcdb7f14f9 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -396,16 +396,41 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) return hwpt; } +static struct iommufd_hw_pagetable * +iommufd_device_do_attach(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + int rc; + + rc = iommufd_hw_pagetable_attach(hwpt, idev); + if (rc) + return ERR_PTR(rc); + return NULL; +} + +typedef struct iommufd_hw_pagetable *(*attach_fn)( + struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); + /* * When automatically managing the domains we search for a compatible domain in * the iopt and if one is found use it, otherwise create a new domain. * Automatic domain selection will never pick a manually created domain. */ -static int iommufd_device_auto_get_domain(struct iommufd_device *idev, - struct iommufd_ioas *ioas, u32 *pt_id) +static struct iommufd_hw_pagetable * +iommufd_device_auto_get_domain(struct iommufd_device *idev, + struct iommufd_ioas *ioas, u32 *pt_id, + attach_fn do_attach) { + /* + * iommufd_hw_pagetable_attach() is called by + * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as + * iommufd_device_do_attach(). So if we are in this mode then we prefer + * to use the immediate_attach path as it supports drivers that can't + * directly allocate a domain. + */ + bool immediate_attach = do_attach == iommufd_device_do_attach; + struct iommufd_hw_pagetable *destroy_hwpt; struct iommufd_hw_pagetable *hwpt; - int rc; /* * There is no differentiation when domains are allocated, so any domain @@ -419,52 +444,58 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, if (!iommufd_lock_obj(&hwpt->obj)) continue; - rc = iommufd_hw_pagetable_attach(hwpt, idev); - iommufd_put_object(&hwpt->obj); - - /* - * -EINVAL means the domain is incompatible with the device. - * Other error codes should propagate to userspace as failure. - * Success means the domain is attached. - */ - if (rc == -EINVAL) - continue; + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) { + iommufd_put_object(&hwpt->obj); + /* + * -EINVAL means the domain is incompatible with the + * device. Other error codes should propagate to + * userspace as failure. Success means the domain is + * attached. + */ + if (PTR_ERR(destroy_hwpt) == -EINVAL) + continue; + goto out_unlock; + } *pt_id = hwpt->obj.id; + iommufd_put_object(&hwpt->obj); goto out_unlock; } - hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, true); + hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, + immediate_attach); if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); + destroy_hwpt = ERR_CAST(hwpt); goto out_unlock; } + + if (!immediate_attach) { + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) + goto out_abort; + } else { + destroy_hwpt = NULL; + } + hwpt->auto_domain = true; *pt_id = hwpt->obj.id; iommufd_object_finalize(idev->ictx, &hwpt->obj); mutex_unlock(&ioas->mutex); - return 0; + return destroy_hwpt; + +out_abort: + iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj); out_unlock: mutex_unlock(&ioas->mutex); - return rc; + return destroy_hwpt; } -/** - * iommufd_device_attach - Connect a device from an iommu_domain - * @idev: device to attach - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID - * - * This connects the device to an iommu_domain, either automatically or manually - * selected. Once this completes the device could do DMA. - * - * The caller should return the resulting pt_id back to userspace. - * This function is undone by calling iommufd_device_detach(). - */ -int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, + attach_fn do_attach) { + struct iommufd_hw_pagetable *destroy_hwpt; struct iommufd_object *pt_obj; - int rc; pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY); if (IS_ERR(pt_obj)) @@ -475,8 +506,8 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); - rc = iommufd_hw_pagetable_attach(hwpt, idev); - if (rc) + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) goto out_put_pt_obj; break; } @@ -484,22 +515,54 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) struct iommufd_ioas *ioas = container_of(pt_obj, struct iommufd_ioas, obj); - rc = iommufd_device_auto_get_domain(idev, ioas, pt_id); - if (rc) + destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id, + do_attach); + if (IS_ERR(destroy_hwpt)) goto out_put_pt_obj; break; } default: - rc = -EINVAL; + destroy_hwpt = ERR_PTR(-EINVAL); goto out_put_pt_obj; } + iommufd_put_object(pt_obj); - refcount_inc(&idev->obj.users); - rc = 0; + /* This destruction has to be after we unlock everything */ + if (destroy_hwpt) + iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt); + return 0; out_put_pt_obj: iommufd_put_object(pt_obj); - return rc; + return PTR_ERR(destroy_hwpt); +} + +/** + * iommufd_device_attach - Connect a device to an iommu_domain + * @idev: device to attach + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE + * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * + * This connects the device to an iommu_domain, either automatically or manually + * selected. Once this completes the device could do DMA. + * + * The caller should return the resulting pt_id back to userspace. + * This function is undone by calling iommufd_device_detach(). + */ +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +{ + int rc; + + rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach); + if (rc) + return rc; + + /* + * Pairs with iommufd_device_detach() - catches caller bugs attempting + * to destroy a device with an attachment. + */ + refcount_inc(&idev->obj.users); + return 0; } EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); From addb665924f39f01dc1d6b2de3d21ebbc1232de6 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Jul 2023 15:12:09 -0300 Subject: [PATCH 13/26] iommu: Introduce a new iommu_group_replace_domain() API qemu has a need to replace the translations associated with a domain when the guest does large-scale operations like switching between an IDENTITY domain and, say, dma-iommu.c. Currently, it does this by replacing all the mappings in a single domain, but this is very inefficient and means that domains have to be per-device rather than per-translation. Provide a high-level API to allow replacements of one domain with another. This is similar to a detach/attach cycle except it doesn't force the group to go to the blocking domain in-between. By removing this forced blocking domain the iommu driver has the opportunity to implement a non-disruptive replacement of the domain to the greatest extent its hardware allows. This allows the qemu emulation of the vIOMMU to be more complete, as real hardware often has a non-distruptive replacement capability. It could be possible to address this by simply removing the protection from the iommu_attach_group(), but it is not so clear if that is safe for the few users. Thus, add a new API to serve this new purpose. All drivers are already required to support changing between active UNMANAGED domains when using their attach_dev ops. This API is expected to be used only by IOMMUFD, so add to the iommu-priv header and mark it as IOMMUFD_INTERNAL. Link: https://lore.kernel.org/r/13-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu-priv.h | 10 ++++++++++ drivers/iommu/iommu.c | 27 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 drivers/iommu/iommu-priv.h diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h new file mode 100644 index 0000000000000..7c8011bfd1537 --- /dev/null +++ b/drivers/iommu/iommu-priv.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __LINUX_IOMMU_PRIV_H +#define __LINUX_IOMMU_PRIV_H + +#include + +int iommu_group_replace_domain(struct iommu_group *group, + struct iommu_domain *new_domain); + +#endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b0804fbad6f9d..5876196765737 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -34,6 +34,7 @@ #include #include "dma-iommu.h" +#include "iommu-priv.h" #include "iommu-sva.h" @@ -2114,6 +2115,32 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_attach_group); +/** + * iommu_group_replace_domain - replace the domain that a group is attached to + * @new_domain: new IOMMU domain to replace with + * @group: IOMMU group that will be attached to the new domain + * + * This API allows the group to switch domains without being forced to go to + * the blocking domain in-between. + * + * If the currently attached domain is a core domain (e.g. a default_domain), + * it will act just like the iommu_attach_group(). + */ +int iommu_group_replace_domain(struct iommu_group *group, + struct iommu_domain *new_domain) +{ + int ret; + + if (!new_domain) + return -EINVAL; + + mutex_lock(&group->mutex); + ret = __iommu_group_set_domain(group, new_domain); + mutex_unlock(&group->mutex); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, IOMMUFD_INTERNAL); + static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, From e88d4ec154a87884086d3e38f2dfc66aec11bf8c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:10 -0300 Subject: [PATCH 14/26] iommufd: Add iommufd_device_replace() Replace allows all the devices in a group to move in one step to a new HWPT. Further, the HWPT move is done without going through a blocking domain so that the IOMMU driver can implement some level of non-distruption to ongoing DMA if that has meaning for it (eg for future special driver domains) Replace uses a lot of the same logic as normal attach, except the actual domain change over has different restrictions, and we are careful to sequence things so that failure is going to leave everything the way it was, and not get trapped in a blocking domain or something if there is ENOMEM. Link: https://lore.kernel.org/r/14-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 101 +++++++++++++++++++++++++++++++++ drivers/iommu/iommufd/main.c | 1 + 2 files changed, 102 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 138bcdb7f14f9..57c0e81f5073b 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -4,6 +4,7 @@ #include #include #include +#include "../iommu-priv.h" #include "io_pagetable.h" #include "iommufd_private.h" @@ -408,6 +409,84 @@ iommufd_device_do_attach(struct iommufd_device *idev, return NULL; } +static struct iommufd_hw_pagetable * +iommufd_device_do_replace(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hw_pagetable *old_hwpt; + unsigned int num_devices = 0; + struct iommufd_device *cur; + int rc; + + mutex_lock(&idev->igroup->lock); + + if (igroup->hwpt == NULL) { + rc = -EINVAL; + goto err_unlock; + } + + if (hwpt == igroup->hwpt) { + mutex_unlock(&idev->igroup->lock); + return NULL; + } + + /* Try to upgrade the domain we have */ + list_for_each_entry(cur, &igroup->device_list, group_item) { + num_devices++; + if (cur->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (rc) + goto err_unlock; + } + } + + old_hwpt = igroup->hwpt; + if (hwpt->ioas != old_hwpt->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) { + rc = iopt_table_enforce_dev_resv_regions( + &hwpt->ioas->iopt, cur->dev, NULL); + if (rc) + goto err_unresv; + } + } + + rc = iommufd_group_setup_msi(idev->igroup, hwpt); + if (rc) + goto err_unresv; + + rc = iommu_group_replace_domain(igroup->group, hwpt->domain); + if (rc) + goto err_unresv; + + if (hwpt->ioas != old_hwpt->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&old_hwpt->ioas->iopt, + cur->dev); + } + + igroup->hwpt = hwpt; + + /* + * Move the refcounts held by the device_list to the new hwpt. Retain a + * refcount for this thread as the caller will free it. + */ + refcount_add(num_devices, &hwpt->obj.users); + if (num_devices > 1) + WARN_ON(refcount_sub_and_test(num_devices - 1, + &old_hwpt->obj.users)); + mutex_unlock(&idev->igroup->lock); + + /* Caller must destroy old_hwpt */ + return old_hwpt; +err_unresv: + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); +err_unlock: + mutex_unlock(&idev->igroup->lock); + return ERR_PTR(rc); +} + typedef struct iommufd_hw_pagetable *(*attach_fn)( struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); @@ -566,6 +645,28 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) } EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); +/** + * iommufd_device_replace - Change the device's iommu_domain + * @idev: device to change + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE + * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * + * This is the same as:: + * + * iommufd_device_detach(); + * iommufd_device_attach(); + * + * If it fails then no change is made to the attachment. The iommu driver may + * implement this so there is no disruption in translation. This can only be + * called if iommufd_device_attach() has already succeeded. + */ +int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) +{ + return iommufd_device_change_pt(idev, pt_id, + &iommufd_device_do_replace); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD); + /** * iommufd_device_detach - Disconnect a device to an iommu_domain * @idev: device to detach diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 438f55e6aa880..521225c6a0714 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -490,5 +490,6 @@ module_exit(iommufd_exit); MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif +MODULE_IMPORT_NS(IOMMUFD_INTERNAL); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); From 83f7bc6fdfd2fca11f35c061194d60f88acb3086 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:11 -0300 Subject: [PATCH 15/26] iommufd: Make destroy_rwsem use a lock class per object type The selftest invokes things like replace under the object lock of its idev which protects the idev in a similar way to a real user. Unfortunately this triggers lockdep. A lock class per type will solve the problem. Link: https://lore.kernel.org/r/15-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_private.h | 1 + drivers/iommu/iommufd/main.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 47fed9feefd04..9394f0e6692a9 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -119,6 +119,7 @@ enum iommufd_object_type { #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif + IOMMUFD_OBJ_MAX, }; /* Base struct for all objects with a userspace ID handle. */ diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 521225c6a0714..56cdf3c796a7e 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -33,6 +33,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type) { + static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX]; struct iommufd_object *obj; int rc; @@ -40,7 +41,15 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, if (!obj) return ERR_PTR(-ENOMEM); obj->type = type; - init_rwsem(&obj->destroy_rwsem); + /* + * In most cases the destroy_rwsem is obtained with try so it doesn't + * interact with lockdep, however on destroy we have to sleep. This + * means if we have to destroy an object while holding a get on another + * object it triggers lockdep. Using one locking class per object type + * is a simple and reasonable way to avoid this. + */ + __init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem", + &obj_keys[type]); refcount_set(&obj->users, 1); /* From fa1ffdb9e2937d04657c33a146aa0d2cd39abba0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Jul 2023 15:12:12 -0300 Subject: [PATCH 16/26] iommufd/selftest: Test iommufd_device_replace() Allow the selftest to call the function on the mock idev, add some tests to exercise it. Link: https://lore.kernel.org/r/16-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 4 ++ drivers/iommu/iommufd/selftest.c | 39 +++++++++++++++++ include/linux/iommufd.h | 1 + tools/testing/selftests/iommu/iommufd.c | 37 ++++++++++++++-- .../selftests/iommu/iommufd_fail_nth.c | 42 +++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 30 +++++++++++++ 6 files changed, 149 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index b3d69cca77295..e3f1035cbd646 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -17,6 +17,7 @@ enum { IOMMU_TEST_OP_ACCESS_PAGES, IOMMU_TEST_OP_ACCESS_RW, IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, + IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, }; enum { @@ -52,6 +53,9 @@ struct iommu_test_cmd { __u32 out_stdev_id; __u32 out_hwpt_id; } mock_domain; + struct { + __u32 pt_id; + } mock_domain_replace; struct { __aligned_u64 iova; __aligned_u64 length; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 74c2076105d48..eb33dffb955dc 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -455,6 +455,42 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, return rc; } +/* Replace the mock domain with a manually allocated hw_pagetable */ +static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, + unsigned int device_id, u32 pt_id, + struct iommu_test_cmd *cmd) +{ + struct iommufd_object *dev_obj; + struct selftest_obj *sobj; + int rc; + + /* + * Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure + * it doesn't race with detach, which is not allowed. + */ + dev_obj = + iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST); + if (IS_ERR(dev_obj)) + return PTR_ERR(dev_obj); + + sobj = container_of(dev_obj, struct selftest_obj, obj); + if (sobj->type != TYPE_IDEV) { + rc = -EINVAL; + goto out_dev_obj; + } + + rc = iommufd_device_replace(sobj->idev.idev, &pt_id); + if (rc) + goto out_dev_obj; + + cmd->mock_domain_replace.pt_id = pt_id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_dev_obj: + iommufd_put_object(dev_obj); + return rc; +} + /* Add an additional reserved IOVA to the IOAS */ static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, @@ -948,6 +984,9 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->add_reserved.length); case IOMMU_TEST_OP_MOCK_DOMAIN: return iommufd_test_mock_domain(ucmd, cmd); + case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE: + return iommufd_test_mock_domain_replace( + ucmd, cmd->id, cmd->mock_domain_replace.pt_id, cmd); case IOMMU_TEST_OP_MD_CHECK_MAP: return iommufd_test_md_check_pa( ucmd, cmd->id, cmd->check_map.iova, diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 9657c58813dc8..0ac60256b6592 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -23,6 +23,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, void iommufd_device_unbind(struct iommufd_device *idev); int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); +int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index e4a6b33cfde48..96881ecca5c75 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -9,9 +9,6 @@ #include "iommufd_utils.h" -static void *buffer; - -static unsigned long PAGE_SIZE; static unsigned long HUGEPAGE_SIZE; #define MOCK_PAGE_SIZE (PAGE_SIZE / 2) @@ -1035,6 +1032,7 @@ FIXTURE(iommufd_mock_domain) uint32_t ioas_id; uint32_t hwpt_id; uint32_t hwpt_ids[2]; + uint32_t stdev_ids[2]; int mmap_flags; size_t mmap_buf_size; }; @@ -1056,7 +1054,8 @@ FIXTURE_SETUP(iommufd_mock_domain) ASSERT_GE(ARRAY_SIZE(self->hwpt_ids), variant->mock_domains); for (i = 0; i != variant->mock_domains; i++) - test_cmd_mock_domain(self->ioas_id, NULL, &self->hwpt_ids[i]); + test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i], + &self->hwpt_ids[i]); self->hwpt_id = self->hwpt_ids[0]; self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS; @@ -1308,6 +1307,36 @@ TEST_F(iommufd_mock_domain, user_copy) test_ioctl_destroy(ioas_id); } +TEST_F(iommufd_mock_domain, replace) +{ + uint32_t ioas_id; + + test_ioctl_ioas_alloc(&ioas_id); + + test_cmd_mock_domain_replace(self->stdev_ids[0], ioas_id); + + /* + * Replacing the IOAS causes the prior HWPT to be deallocated, thus we + * should get enoent when we try to use it. + */ + if (variant->mock_domains == 1) + test_err_mock_domain_replace(ENOENT, self->stdev_ids[0], + self->hwpt_ids[0]); + + test_cmd_mock_domain_replace(self->stdev_ids[0], ioas_id); + if (variant->mock_domains >= 2) { + test_cmd_mock_domain_replace(self->stdev_ids[0], + self->hwpt_ids[1]); + test_cmd_mock_domain_replace(self->stdev_ids[0], + self->hwpt_ids[1]); + test_cmd_mock_domain_replace(self->stdev_ids[0], + self->hwpt_ids[0]); + } + + test_cmd_mock_domain_replace(self->stdev_ids[0], self->ioas_id); + test_ioctl_destroy(ioas_id); +} + /* VFIO compatibility IOCTLs */ TEST_F(iommufd, simple_ioctls) diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index d9afcb23810e1..96fb2f04623f2 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -41,6 +41,8 @@ static int writeat(int dfd, const char *fn, const char *val) static __attribute__((constructor)) void setup_buffer(void) { + PAGE_SIZE = sysconf(_SC_PAGE_SIZE); + BUFFER_SIZE = 2*1024*1024; buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE, @@ -569,4 +571,44 @@ TEST_FAIL_NTH(basic_fail_nth, access_pin_domain) return 0; } +/* device.c */ +TEST_FAIL_NTH(basic_fail_nth, device) +{ + uint32_t ioas_id; + uint32_t ioas_id2; + uint32_t stdev_id; + __u64 iova; + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id2)) + return -1; + + iova = MOCK_APERTURE_START; + if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, PAGE_SIZE, &iova, + IOMMU_IOAS_MAP_FIXED_IOVA | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + if (_test_ioctl_ioas_map(self->fd, ioas_id2, buffer, PAGE_SIZE, &iova, + IOMMU_IOAS_MAP_FIXED_IOVA | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + fail_nth_enable(); + + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL)) + return -1; + + if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) + return -1; + return 0; +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 85d6662ef8e86..8b11bb7009526 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -19,6 +19,8 @@ static void *buffer; static unsigned long BUFFER_SIZE; +static unsigned long PAGE_SIZE; + /* * Have the kernel check the refcount on pages. I don't know why a freshly * mmap'd anon non-compound page starts out with a ref of 3 @@ -66,6 +68,34 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id, EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \ stdev_id, hwpt_id)) +static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, + __u32 *hwpt_id) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, + .id = stdev_id, + .mock_domain_replace = { + .pt_id = pt_id, + }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret) + return ret; + if (hwpt_id) + *hwpt_id = cmd.mock_domain_replace.pt_id; + return 0; +} + +#define test_cmd_mock_domain_replace(stdev_id, pt_id) \ + ASSERT_EQ(0, _test_cmd_mock_domain_replace(self->fd, stdev_id, pt_id, \ + NULL)) +#define test_err_mock_domain_replace(_errno, stdev_id, pt_id) \ + EXPECT_ERRNO(_errno, _test_cmd_mock_domain_replace(self->fd, stdev_id, \ + pt_id, NULL)) + static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { From 7074d7bd67d495cb3f6fe7c7c96b357a3b9d4ec2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:13 -0300 Subject: [PATCH 17/26] iommufd: Add IOMMU_HWPT_ALLOC This allows userspace to manually create HWPTs on IOAS's and then use those HWPTs as inputs to iommufd_device_attach/replace(). Following series will extend this to allow creating iommu_domains with driver specific parameters. Link: https://lore.kernel.org/r/17-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 46 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 9 +++++ drivers/iommu/iommufd/main.c | 3 ++ include/uapi/linux/iommufd.h | 26 ++++++++++++++ 4 files changed, 84 insertions(+) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 2087b51d9807a..cf2c1504e20d8 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -3,6 +3,7 @@ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include +#include #include "iommufd_private.h" @@ -131,3 +132,48 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); } + +int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_alloc *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_device *idev; + struct iommufd_ioas *ioas; + int rc; + + if (cmd->flags || cmd->__reserved) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id); + if (IS_ERR(ioas)) { + rc = PTR_ERR(ioas); + goto out_put_idev; + } + + mutex_lock(&ioas->mutex); + hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_unlock; + } + + cmd->out_hwpt_id = hwpt->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_hwpt; + iommufd_object_finalize(ucmd->ictx, &hwpt->obj); + goto out_unlock; + +out_hwpt: + iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj); +out_unlock: + mutex_unlock(&ioas->mutex); + iommufd_put_object(&ioas->obj); +out_put_idev: + iommufd_put_object(&idev->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 9394f0e6692a9..dba730129b8c2 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -262,6 +262,7 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); void iommufd_hw_pagetable_abort(struct iommufd_object *obj); +int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) @@ -298,6 +299,14 @@ struct iommufd_device { bool enforce_cache_coherency; }; +static inline struct iommufd_device * +iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_DEVICE), + struct iommufd_device, obj); +} + void iommufd_device_destroy(struct iommufd_object *obj); struct iommufd_access { diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 56cdf3c796a7e..94c498b8fdf68 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -265,6 +265,7 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) union ucmd_buffer { struct iommu_destroy destroy; + struct iommu_hwpt_alloc hwpt; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; struct iommu_ioas_copy ioas_copy; @@ -296,6 +297,8 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, + __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 98ebba80cfa1f..8245c01adca67 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -45,6 +45,7 @@ enum { IOMMUFD_CMD_IOAS_UNMAP, IOMMUFD_CMD_OPTION, IOMMUFD_CMD_VFIO_IOAS, + IOMMUFD_CMD_HWPT_ALLOC, }; /** @@ -344,4 +345,29 @@ struct iommu_vfio_ioas { __u16 __reserved; }; #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) + +/** + * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) + * @size: sizeof(struct iommu_hwpt_alloc) + * @flags: Must be 0 + * @dev_id: The device to allocate this HWPT for + * @pt_id: The IOAS to connect this HWPT to + * @out_hwpt_id: The ID of the new HWPT + * @__reserved: Must be 0 + * + * Explicitly allocate a hardware page table object. This is the same object + * type that is returned by iommufd_device_attach() and represents the + * underlying iommu driver's iommu_domain kernel object. + * + * A HWPT will be created with the IOVA mappings from the given IOAS. + */ +struct iommu_hwpt_alloc { + __u32 size; + __u32 flags; + __u32 dev_id; + __u32 pt_id; + __u32 out_hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) #endif From 7a467e02b339a50ed2762edd72e763925ac2b0a3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:14 -0300 Subject: [PATCH 18/26] iommufd/selftest: Return the real idev id from selftest mock_domain Now that we actually call iommufd_device_bind() we can return the idev_id from that function to userspace for use in other APIs. Link: https://lore.kernel.org/r/18-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 2 ++ drivers/iommu/iommufd/selftest.c | 1 + tools/testing/selftests/iommu/iommufd.c | 17 +++++++++-------- .../testing/selftests/iommu/iommufd_fail_nth.c | 18 ++++++++++-------- tools/testing/selftests/iommu/iommufd_utils.h | 12 +++++++----- 5 files changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index e3f1035cbd646..dd9168a20ddf4 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -52,6 +52,8 @@ struct iommu_test_cmd { struct { __u32 out_stdev_id; __u32 out_hwpt_id; + /* out_idev_id is the standard iommufd_bind object */ + __u32 out_idev_id; } mock_domain; struct { __u32 pt_id; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index eb33dffb955dc..9d43334e4faf1 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -443,6 +443,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, /* Userspace must destroy the device_id to destroy the object */ cmd->mock_domain.out_hwpt_id = pt_id; cmd->mock_domain.out_stdev_id = sobj->obj.id; + cmd->mock_domain.out_idev_id = idev_id; iommufd_object_finalize(ucmd->ictx, &sobj->obj); return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 96881ecca5c75..78841aba47d03 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -211,7 +211,7 @@ FIXTURE_SETUP(iommufd_ioas) for (i = 0; i != variant->mock_domains; i++) { test_cmd_mock_domain(self->ioas_id, &self->stdev_id, - &self->hwpt_id); + &self->hwpt_id, NULL); self->base_iova = MOCK_APERTURE_START; } } @@ -262,7 +262,7 @@ TEST_F(iommufd_ioas, hwpt_attach) { /* Create a device attached directly to a hwpt */ if (self->stdev_id) { - test_cmd_mock_domain(self->hwpt_id, NULL, NULL); + test_cmd_mock_domain(self->hwpt_id, NULL, NULL, NULL); } else { test_err_mock_domain(ENOENT, self->hwpt_id, NULL, NULL); } @@ -681,7 +681,7 @@ TEST_F(iommufd_ioas, access_pin) _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), &access_cmd)); test_cmd_mock_domain(self->ioas_id, &mock_stdev_id, - &mock_hwpt_id); + &mock_hwpt_id, NULL); check_map_cmd.id = mock_hwpt_id; ASSERT_EQ(0, ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_MAP), @@ -836,7 +836,7 @@ TEST_F(iommufd_ioas, fork_gone) * If a domain already existed then everything was pinned within * the fork, so this copies from one domain to another. */ - test_cmd_mock_domain(self->ioas_id, NULL, NULL); + test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL); check_access_rw(_metadata, self->fd, access_id, MOCK_APERTURE_START, 0); @@ -885,7 +885,7 @@ TEST_F(iommufd_ioas, fork_present) ASSERT_EQ(8, read(efd, &tmp, sizeof(tmp))); /* Read pages from the remote process */ - test_cmd_mock_domain(self->ioas_id, NULL, NULL); + test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL); check_access_rw(_metadata, self->fd, access_id, MOCK_APERTURE_START, 0); ASSERT_EQ(0, close(pipefds[1])); @@ -1033,6 +1033,7 @@ FIXTURE(iommufd_mock_domain) uint32_t hwpt_id; uint32_t hwpt_ids[2]; uint32_t stdev_ids[2]; + uint32_t idev_ids[2]; int mmap_flags; size_t mmap_buf_size; }; @@ -1055,7 +1056,7 @@ FIXTURE_SETUP(iommufd_mock_domain) for (i = 0; i != variant->mock_domains; i++) test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i], - &self->hwpt_ids[i]); + &self->hwpt_ids[i], &self->idev_ids[i]); self->hwpt_id = self->hwpt_ids[0]; self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS; @@ -1249,7 +1250,7 @@ TEST_F(iommufd_mock_domain, all_aligns_copy) /* Add and destroy a domain while the area exists */ old_id = self->hwpt_ids[1]; test_cmd_mock_domain(self->ioas_id, &mock_stdev_id, - &self->hwpt_ids[1]); + &self->hwpt_ids[1], NULL); check_mock_iova(buf + start, iova, length); check_refs(buf + start / PAGE_SIZE * PAGE_SIZE, @@ -1458,7 +1459,7 @@ FIXTURE_SETUP(vfio_compat_mock_domain) /* Create what VFIO would consider a group */ test_ioctl_ioas_alloc(&self->ioas_id); - test_cmd_mock_domain(self->ioas_id, NULL, NULL); + test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL); /* Attach it to the vfio compat */ vfio_ioas_cmd.ioas_id = self->ioas_id; diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 96fb2f04623f2..8ab20df4edc8c 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -315,7 +315,7 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain) fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova, @@ -326,7 +326,7 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain) if (_test_ioctl_destroy(self->fd, stdev_id)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; return 0; } @@ -350,12 +350,13 @@ TEST_FAIL_NTH(basic_fail_nth, map_two_domains) if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2, + NULL)) return -1; if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova, @@ -369,9 +370,10 @@ TEST_FAIL_NTH(basic_fail_nth, map_two_domains) if (_test_ioctl_destroy(self->fd, stdev_id2)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2, + NULL)) return -1; return 0; } @@ -528,7 +530,7 @@ TEST_FAIL_NTH(basic_fail_nth, access_pin_domain) if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova, @@ -603,7 +605,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL, NULL)) return -1; if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 8b11bb7009526..62e01412a7679 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -41,7 +41,7 @@ static unsigned long PAGE_SIZE; }) static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id, - __u32 *hwpt_id) + __u32 *hwpt_id, __u32 *idev_id) { struct iommu_test_cmd cmd = { .size = sizeof(cmd), @@ -59,14 +59,16 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id, assert(cmd.id != 0); if (hwpt_id) *hwpt_id = cmd.mock_domain.out_hwpt_id; + if (idev_id) + *idev_id = cmd.mock_domain.out_idev_id; return 0; } -#define test_cmd_mock_domain(ioas_id, stdev_id, hwpt_id) \ - ASSERT_EQ(0, \ - _test_cmd_mock_domain(self->fd, ioas_id, stdev_id, hwpt_id)) +#define test_cmd_mock_domain(ioas_id, stdev_id, hwpt_id, idev_id) \ + ASSERT_EQ(0, _test_cmd_mock_domain(self->fd, ioas_id, stdev_id, \ + hwpt_id, idev_id)) #define test_err_mock_domain(_errno, ioas_id, stdev_id, hwpt_id) \ EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \ - stdev_id, hwpt_id)) + stdev_id, hwpt_id, NULL)) static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, __u32 *hwpt_id) From 6583c865dec5493beb4c72fd724eb9d43d8d7ebb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:15 -0300 Subject: [PATCH 19/26] iommufd/selftest: Add a selftest for IOMMU_HWPT_ALLOC Test the basic flow. Link: https://lore.kernel.org/r/19-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/io_pagetable.c | 3 +++ tools/testing/selftests/iommu/iommufd.c | 15 +++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 11 +++++++++- tools/testing/selftests/iommu/iommufd_utils.h | 21 +++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 112bea5a84e43..4d095115c2d0b 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1180,6 +1180,9 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, unsigned int num_sw_msi = 0; int rc; + if (iommufd_should_fail()) + return -EINVAL; + down_write(&iopt->iova_rwsem); /* FIXME: drivers allocate memory but there is no failure propogated */ iommu_get_resv_regions(dev, &resv_regions); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 78841aba47d03..dc09c1de319f0 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1338,6 +1338,21 @@ TEST_F(iommufd_mock_domain, replace) test_ioctl_destroy(ioas_id); } +TEST_F(iommufd_mock_domain, alloc_hwpt) +{ + int i; + + for (i = 0; i != variant->mock_domains; i++) { + uint32_t stddev_id; + uint32_t hwpt_id; + + test_cmd_hwpt_alloc(self->idev_ids[0], self->ioas_id, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); + } +} + /* VFIO compatibility IOCTLs */ TEST_F(iommufd, simple_ioctls) diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 8ab20df4edc8c..d4c552e569481 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -579,6 +579,8 @@ TEST_FAIL_NTH(basic_fail_nth, device) uint32_t ioas_id; uint32_t ioas_id2; uint32_t stdev_id; + uint32_t idev_id; + uint32_t hwpt_id; __u64 iova; self->fd = open("/dev/iommu", O_RDWR); @@ -605,11 +607,18 @@ TEST_FAIL_NTH(basic_fail_nth, device) fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL, NULL)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL, + &idev_id)) + return -1; + + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, &hwpt_id)) return -1; if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) return -1; + + if (_test_cmd_mock_domain_replace(self->fd, stdev_id, hwpt_id, NULL)) + return -1; return 0; } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 62e01412a7679..53b4d3f2d9fc6 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -98,6 +98,27 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, EXPECT_ERRNO(_errno, _test_cmd_mock_domain_replace(self->fd, stdev_id, \ pt_id, NULL)) +static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, + __u32 *hwpt_id) +{ + struct iommu_hwpt_alloc cmd = { + .size = sizeof(cmd), + .dev_id = device_id, + .pt_id = pt_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &cmd); + if (ret) + return ret; + if (hwpt_id) + *hwpt_id = cmd.out_hwpt_id; + return 0; +} + +#define test_cmd_hwpt_alloc(device_id, pt_id, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, hwpt_id)) + static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { From 89e07fd4680985351559cdc916098ad72e034bfc Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:23 -0700 Subject: [PATCH 20/26] vfio: Do not allow !ops->dma_unmap in vfio_pin/unpin_pages() A driver that doesn't implement ops->dma_unmap shouldn't be allowed to do vfio_pin/unpin_pages(), though it can use vfio_dma_rw() to access an iova range. Deny !ops->dma_unmap cases in vfio_pin/unpin_pages(). Link: https://lore.kernel.org/r/85d622729d8f2334b35d42f1c568df1ededb9171.1690523699.git.nicolinc@nvidia.com Suggested-by: Kevin Tian Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 902f06e52c483..0da8ed81a97d2 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1483,6 +1483,8 @@ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, /* group->container cannot change while a vfio device is open */ if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) return -EINVAL; + if (!device->ops->dma_unmap) + return -EINVAL; if (vfio_device_has_container(device)) return vfio_device_container_pin_pages(device, iova, npage, prot, pages); @@ -1520,6 +1522,8 @@ void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) { if (WARN_ON(!vfio_assert_device_open(device))) return; + if (WARN_ON(!device->ops->dma_unmap)) + return; if (vfio_device_has_container(device)) { vfio_device_container_unpin_pages(device, iova, npage); From 5d5c85ff6246c1b70b3b75a0b9d9fe749d0a5652 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:24 -0700 Subject: [PATCH 21/26] iommufd: Allow passing in iopt_access_list_id to iopt_remove_access() This is a preparatory change for ioas replacement support for accesses. The replacement routine does an iopt_add_access() for a new IOAS first and then iopt_remove_access() for the old IOAS upon the success of the first call. However, the first call overrides the iopt_access_list_id in the access struct, resulting in iopt_remove_access() being unable to work on the old IOAS. Add an iopt_access_list_id as a parameter to iopt_remove_access, so the replacement routine can save the id before it gets overwritten. Pass the id in iopt_remove_access() for a proper cleanup. The existing callers should just pass in access->iopt_access_list_id. Link: https://lore.kernel.org/r/7bb939b9e0102da0c099572bb3de78ab7622221e.1690523699.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 6 ++++-- drivers/iommu/iommufd/io_pagetable.c | 6 +++--- drivers/iommu/iommufd/iommufd_private.h | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 57c0e81f5073b..7a3e8660b902f 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -690,7 +690,8 @@ void iommufd_access_destroy_object(struct iommufd_object *obj) container_of(obj, struct iommufd_access, obj); if (access->ioas) { - iopt_remove_access(&access->ioas->iopt, access); + iopt_remove_access(&access->ioas->iopt, access, + access->iopt_access_list_id); refcount_dec(&access->ioas->obj.users); access->ioas = NULL; } @@ -776,7 +777,8 @@ void iommufd_access_detach(struct iommufd_access *access) access->ops->unmap(access->data, 0, ULONG_MAX); mutex_lock(&access->ioas_lock); } - iopt_remove_access(&cur_ioas->iopt, access); + iopt_remove_access(&cur_ioas->iopt, access, + access->iopt_access_list_id); refcount_dec(&cur_ioas->obj.users); out: access->ioas_unpin = NULL; diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 4d095115c2d0b..3a598182b7619 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1158,12 +1158,12 @@ int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) } void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access) + struct iommufd_access *access, + u32 iopt_access_list_id) { down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); - WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) != - access); + WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access); WARN_ON(iopt_calculate_iova_alignment(iopt)); up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index dba730129b8c2..8ba786bc95ff8 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -323,7 +323,8 @@ struct iommufd_access { int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access); + struct iommufd_access *access, + u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); #ifdef CONFIG_IOMMUFD_TEST From 9227da7816dd1a42e20d41e2244cb63c205477ca Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:25 -0700 Subject: [PATCH 22/26] iommufd: Add iommufd_access_change_ioas(_id) helpers The complication of the mutex and refcount will be amplified after we introduce the replace support for accesses. So, add a preparatory change of a constitutive helper iommufd_access_change_ioas() and its wrapper iommufd_access_change_ioas_id(). They can simply take care of existing iommufd_access_attach() and iommufd_access_detach(), properly sequencing the refcount puts so that they are truely at the end of the sequence after we know the IOAS pointer is not required any more. Link: https://lore.kernel.org/r/da0c462532193b447329c4eb975a596f47e49b70.1690523699.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 109 +++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 7a3e8660b902f..e5c408415e95e 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -684,6 +684,70 @@ void iommufd_device_detach(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); +/* + * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at + * a valid cur_ioas (access->ioas). A caller passing in a valid new_ioas should + * call iommufd_put_object() if it does an iommufd_get_object() for a new_ioas. + */ +static int iommufd_access_change_ioas(struct iommufd_access *access, + struct iommufd_ioas *new_ioas) +{ + u32 iopt_access_list_id = access->iopt_access_list_id; + struct iommufd_ioas *cur_ioas = access->ioas; + int rc; + + lockdep_assert_held(&access->ioas_lock); + + /* We are racing with a concurrent detach, bail */ + if (cur_ioas != access->ioas_unpin) + return -EBUSY; + + if (cur_ioas == new_ioas) + return 0; + + /* + * Set ioas to NULL to block any further iommufd_access_pin_pages(). + * iommufd_access_unpin_pages() can continue using access->ioas_unpin. + */ + access->ioas = NULL; + + if (new_ioas) { + rc = iopt_add_access(&new_ioas->iopt, access); + if (rc) { + access->ioas = cur_ioas; + return rc; + } + refcount_inc(&new_ioas->obj.users); + } + + if (cur_ioas) { + if (access->ops->unmap) { + mutex_unlock(&access->ioas_lock); + access->ops->unmap(access->data, 0, ULONG_MAX); + mutex_lock(&access->ioas_lock); + } + iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id); + refcount_dec(&cur_ioas->obj.users); + } + + access->ioas = new_ioas; + access->ioas_unpin = new_ioas; + + return 0; +} + +static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id) +{ + struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id); + int rc; + + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + rc = iommufd_access_change_ioas(access, ioas); + iommufd_put_object(&ioas->obj); + return rc; +} + void iommufd_access_destroy_object(struct iommufd_object *obj) { struct iommufd_access *access = @@ -761,60 +825,29 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); void iommufd_access_detach(struct iommufd_access *access) { - struct iommufd_ioas *cur_ioas = access->ioas; - mutex_lock(&access->ioas_lock); - if (WARN_ON(!access->ioas)) - goto out; - /* - * Set ioas to NULL to block any further iommufd_access_pin_pages(). - * iommufd_access_unpin_pages() can continue using access->ioas_unpin. - */ - access->ioas = NULL; - - if (access->ops->unmap) { + if (WARN_ON(!access->ioas)) { mutex_unlock(&access->ioas_lock); - access->ops->unmap(access->data, 0, ULONG_MAX); - mutex_lock(&access->ioas_lock); + return; } - iopt_remove_access(&cur_ioas->iopt, access, - access->iopt_access_list_id); - refcount_dec(&cur_ioas->obj.users); -out: - access->ioas_unpin = NULL; + WARN_ON(iommufd_access_change_ioas(access, NULL)); mutex_unlock(&access->ioas_lock); } EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD); int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) { - struct iommufd_ioas *new_ioas; - int rc = 0; + int rc; mutex_lock(&access->ioas_lock); - if (WARN_ON(access->ioas || access->ioas_unpin)) { + if (WARN_ON(access->ioas)) { mutex_unlock(&access->ioas_lock); return -EINVAL; } - new_ioas = iommufd_get_ioas(access->ictx, ioas_id); - if (IS_ERR(new_ioas)) { - mutex_unlock(&access->ioas_lock); - return PTR_ERR(new_ioas); - } - - rc = iopt_add_access(&new_ioas->iopt, access); - if (rc) { - mutex_unlock(&access->ioas_lock); - iommufd_put_object(&new_ioas->obj); - return rc; - } - iommufd_ref_to_users(&new_ioas->obj); - - access->ioas = new_ioas; - access->ioas_unpin = new_ioas; + rc = iommufd_access_change_ioas_id(access, ioas_id); mutex_unlock(&access->ioas_lock); - return 0; + return rc; } EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); From 6129b59fcdf374b5d82e1f4518884da13de38b1a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:26 -0700 Subject: [PATCH 23/26] iommufd: Use iommufd_access_change_ioas in iommufd_access_destroy_object Update iommufd_access_destroy_object() to call the new iommufd_access_change_ioas() helper. It is impossible to legitimately race iommufd_access_destroy_object() with iommufd_access_change_ioas() as iommufd_access_destroy_object() is only called once the refcount reache zero, so any concurrent iommufd_access_change_ioas() is already UAFing the memory. Link: https://lore.kernel.org/r/f9fbeca2cde7f8515da18d689b3e02a6a40a5e14.1690523699.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index e5c408415e95e..c0b9cd97ec58d 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -753,12 +753,10 @@ void iommufd_access_destroy_object(struct iommufd_object *obj) struct iommufd_access *access = container_of(obj, struct iommufd_access, obj); - if (access->ioas) { - iopt_remove_access(&access->ioas->iopt, access, - access->iopt_access_list_id); - refcount_dec(&access->ioas->obj.users); - access->ioas = NULL; - } + mutex_lock(&access->ioas_lock); + if (access->ioas) + WARN_ON(iommufd_access_change_ioas(access, NULL)); + mutex_unlock(&access->ioas_lock); iommufd_ctx_put(access->ictx); } From 70c16123d865046899c36e3655b2d611f38f51a3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:27 -0700 Subject: [PATCH 24/26] iommufd: Add iommufd_access_replace() API Taking advantage of the new iommufd_access_change_ioas_id helper, add an iommufd_access_replace() API for the VFIO emulated pathway to use. Link: https://lore.kernel.org/r/a3267b924fd5f45e0d3a1dd13a9237e923563862.1690523699.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 15 +++++++++++++++ include/linux/iommufd.h | 1 + 2 files changed, 16 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index c0b9cd97ec58d..4c37eeea2bcda 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -849,6 +849,21 @@ int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) } EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); +int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) +{ + int rc; + + mutex_lock(&access->ioas_lock); + if (!access->ioas) { + mutex_unlock(&access->ioas_lock); + return -ENOENT; + } + rc = iommufd_access_change_ioas_id(access, ioas_id); + mutex_unlock(&access->ioas_lock); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD); + /** * iommufd_access_notify_unmap - Notify users of an iopt to stop using it * @iopt: iopt to work on diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 0ac60256b6592..ffc3a949f8374 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -49,6 +49,7 @@ iommufd_access_create(struct iommufd_ctx *ictx, const struct iommufd_access_ops *ops, void *data, u32 *id); void iommufd_access_destroy(struct iommufd_access *access); int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id); +int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id); void iommufd_access_detach(struct iommufd_access *access); void iommufd_ctx_get(struct iommufd_ctx *ictx); From c154660b6e26c3f0670a49f43f1fafa5d65f6d39 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:28 -0700 Subject: [PATCH 25/26] iommufd/selftest: Add IOMMU_TEST_OP_ACCESS_REPLACE_IOAS coverage Add a new IOMMU_TEST_OP_ACCESS_REPLACE_IOAS to allow replacing the access->ioas, corresponding to the iommufd_access_replace() helper. Then add replace coverage as a part of user_copy test case, which basically repeats the copy test after replacing the old ioas with a new one. Link: https://lore.kernel.org/r/a4897f93d41c34b972213243b8dbf4c3832842e4.1690523699.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 4 +++ drivers/iommu/iommufd/selftest.c | 19 ++++++++++++ tools/testing/selftests/iommu/iommufd.c | 29 +++++++++++++++++-- tools/testing/selftests/iommu/iommufd_utils.h | 19 ++++++++++++ 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index dd9168a20ddf4..258de2253b61d 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -18,6 +18,7 @@ enum { IOMMU_TEST_OP_ACCESS_RW, IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, + IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, }; enum { @@ -91,6 +92,9 @@ struct iommu_test_cmd { struct { __u32 limit; } memory_limit; + struct { + __u32 ioas_id; + } access_replace_ioas; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 9d43334e4faf1..bb2cd54ca7b6b 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -785,6 +785,22 @@ static int iommufd_test_create_access(struct iommufd_ucmd *ucmd, return rc; } +static int iommufd_test_access_replace_ioas(struct iommufd_ucmd *ucmd, + unsigned int access_id, + unsigned int ioas_id) +{ + struct selftest_access *staccess; + int rc; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + rc = iommufd_access_replace(staccess->access, ioas_id); + fput(staccess->file); + return rc; +} + /* Check that the pages in a page array match the pages in the user VA */ static int iommufd_test_check_pages(void __user *uptr, struct page **pages, size_t npages) @@ -1000,6 +1016,9 @@ int iommufd_test(struct iommufd_ucmd *ucmd) case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); + case IOMMU_TEST_OP_ACCESS_REPLACE_IOAS: + return iommufd_test_access_replace_ioas( + ucmd, cmd->id, cmd->access_replace_ioas.ioas_id); case IOMMU_TEST_OP_ACCESS_PAGES: return iommufd_test_access_pages( ucmd, cmd->id, cmd->access_pages.iova, diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index dc09c1de319f0..8acd0af37aa5e 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1283,7 +1283,13 @@ TEST_F(iommufd_mock_domain, user_copy) .dst_iova = MOCK_APERTURE_START, .length = BUFFER_SIZE, }; - unsigned int ioas_id; + struct iommu_ioas_unmap unmap_cmd = { + .size = sizeof(unmap_cmd), + .ioas_id = self->ioas_id, + .iova = MOCK_APERTURE_START, + .length = BUFFER_SIZE, + }; + unsigned int new_ioas_id, ioas_id; /* Pin the pages in an IOAS with no domains then copy to an IOAS with domains */ test_ioctl_ioas_alloc(&ioas_id); @@ -1301,11 +1307,30 @@ TEST_F(iommufd_mock_domain, user_copy) ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE); + /* Now replace the ioas with a new one */ + test_ioctl_ioas_alloc(&new_ioas_id); + test_ioctl_ioas_map_id(new_ioas_id, buffer, BUFFER_SIZE, + ©_cmd.src_iova); + test_cmd_access_replace_ioas(access_cmd.id, new_ioas_id); + + /* Destroy the old ioas and cleanup copied mapping */ + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_UNMAP, &unmap_cmd)); + test_ioctl_destroy(ioas_id); + + /* Then run the same test again with the new ioas */ + access_cmd.access_pages.iova = copy_cmd.src_iova; + ASSERT_EQ(0, + ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), + &access_cmd)); + copy_cmd.src_ioas_id = new_ioas_id; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); + check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE); + test_cmd_destroy_access_pages( access_cmd.id, access_cmd.access_pages.out_access_pages_id); test_cmd_destroy_access(access_cmd.id); - test_ioctl_destroy(ioas_id); + test_ioctl_destroy(new_ioas_id); } TEST_F(iommufd_mock_domain, replace) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 53b4d3f2d9fc6..70353e68e5997 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -119,6 +119,25 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, #define test_cmd_hwpt_alloc(device_id, pt_id, hwpt_id) \ ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, hwpt_id)) +static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, + unsigned int ioas_id) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, + .id = access_id, + .access_replace_ioas = { .ioas_id = ioas_id }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret) + return ret; + return 0; +} +#define test_cmd_access_replace_ioas(access_id, ioas_id) \ + ASSERT_EQ(0, _test_cmd_access_replace_ioas(self->fd, access_id, ioas_id)) + static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { From c157fd88619946cd62af47c45214c479749dad8d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:29 -0700 Subject: [PATCH 26/26] vfio: Support IO page table replacement Now both the physical path and the emulated path can support an IO page table replacement. Call iommufd_device_replace/iommufd_access_replace(), when vdev->iommufd_attached is true. Also update the VFIO_DEVICE_ATTACH_IOMMUFD_PT kdoc in the uAPI header. Link: https://lore.kernel.org/r/b5f01956ff161f76aa52c95b0fa1ad6eaca95c4a.1690523699.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/vfio/iommufd.c | 11 ++++++----- include/uapi/linux/vfio.h | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 4d84904fd927a..82eba6966fa50 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -146,9 +146,9 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) return -EINVAL; if (vdev->iommufd_attached) - return -EBUSY; - - rc = iommufd_device_attach(vdev->iommufd_device, pt_id); + rc = iommufd_device_replace(vdev->iommufd_device, pt_id); + else + rc = iommufd_device_attach(vdev->iommufd_device, pt_id); if (rc) return rc; vdev->iommufd_attached = true; @@ -223,8 +223,9 @@ int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id) lockdep_assert_held(&vdev->dev_set->lock); if (vdev->iommufd_attached) - return -EBUSY; - rc = iommufd_access_attach(vdev->iommufd_access, *pt_id); + rc = iommufd_access_replace(vdev->iommufd_access, *pt_id); + else + rc = iommufd_access_attach(vdev->iommufd_access, *pt_id); if (rc) return rc; vdev->iommufd_attached = true; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fa06e3eb49551..537157ff8670d 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -939,6 +939,12 @@ struct vfio_device_bind_iommufd { * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only * allowed on cdev fds. * + * If a vfio device is currently attached to a valid hw_pagetable, without doing + * a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl + * passing in another hw_pagetable (hwpt) id is allowed. This action, also known + * as a hw_pagetable replacement, will replace the device's currently attached + * hw_pagetable with a new hw_pagetable corresponding to the given pt_id. + * * Return: 0 on success, -errno on failure. */ struct vfio_device_attach_iommufd_pt {