From fb21b1568adaa76af7a8c853f37c60fba8b28661 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 3 Feb 2025 21:00:54 -0800 Subject: [PATCH 01/54] iommufd: Make attach_handle generic than fault specific "attach_handle" was added exclusively for the iommufd_fault_iopf_handler() used by IOPF/PRI use cases. Now, both the MSI and PASID series require to reuse the attach_handle for non-fault cases. Add a set of new attach/detach/replace helpers that does the attach_handle allocation/releasing/replacement in the common path and also handles those fault specific routines such as iopf enabling/disabling and auto response. This covers both non-fault and fault cases in a clean way, replacing those inline helpers in the header. The following patch will clean up those old helpers in the fault.c file. Link: https://patch.msgid.link/r/32687df01c02291d89986a9fca897bbbe2b10987.1738645017.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 105 ++++++++++++++++++++++++ drivers/iommu/iommufd/fault.c | 8 +- drivers/iommu/iommufd/iommufd_private.h | 33 +------- 3 files changed, 113 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index dfd0898fb6c15..0786290b4056d 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -352,6 +352,111 @@ iommufd_device_attach_reserved_iova(struct iommufd_device *idev, return 0; } +/* The device attach/detach/replace helpers for attach_handle */ + +static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct iommufd_attach_handle *handle; + int rc; + + lockdep_assert_held(&idev->igroup->lock); + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + if (hwpt->fault) { + rc = iommufd_fault_iopf_enable(idev); + if (rc) + goto out_free_handle; + } + + handle->idev = idev; + rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, + &handle->handle); + if (rc) + goto out_disable_iopf; + + return 0; + +out_disable_iopf: + if (hwpt->fault) + iommufd_fault_iopf_disable(idev); +out_free_handle: + kfree(handle); + return rc; +} + +static struct iommufd_attach_handle * +iommufd_device_get_attach_handle(struct iommufd_device *idev) +{ + struct iommu_attach_handle *handle; + + lockdep_assert_held(&idev->igroup->lock); + + handle = + iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); + if (IS_ERR(handle)) + return NULL; + return to_iommufd_handle(handle); +} + +static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct iommufd_attach_handle *handle; + + handle = iommufd_device_get_attach_handle(idev); + iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + if (hwpt->fault) { + iommufd_auto_response_faults(hwpt, handle); + iommufd_fault_iopf_disable(idev); + } + kfree(handle); +} + +static int iommufd_hwpt_replace_device(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) +{ + struct iommufd_attach_handle *handle, *old_handle = + iommufd_device_get_attach_handle(idev); + int rc; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + if (hwpt->fault && !old->fault) { + rc = iommufd_fault_iopf_enable(idev); + if (rc) + goto out_free_handle; + } + + handle->idev = idev; + rc = iommu_replace_group_handle(idev->igroup->group, hwpt->domain, + &handle->handle); + if (rc) + goto out_disable_iopf; + + if (old->fault) { + iommufd_auto_response_faults(hwpt, old_handle); + if (!hwpt->fault) + iommufd_fault_iopf_disable(idev); + } + kfree(old_handle); + + return 0; + +out_disable_iopf: + if (hwpt->fault && !old->fault) + iommufd_fault_iopf_disable(idev); +out_free_handle: + kfree(handle); + return rc; +} + int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index d9a937450e552..cb844e6799d4f 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -17,7 +17,7 @@ #include "../iommu-priv.h" #include "iommufd_private.h" -static int iommufd_fault_iopf_enable(struct iommufd_device *idev) +int iommufd_fault_iopf_enable(struct iommufd_device *idev) { struct device *dev = idev->dev; int ret; @@ -50,7 +50,7 @@ static int iommufd_fault_iopf_enable(struct iommufd_device *idev) return ret; } -static void iommufd_fault_iopf_disable(struct iommufd_device *idev) +void iommufd_fault_iopf_disable(struct iommufd_device *idev) { mutex_lock(&idev->iopf_lock); if (!WARN_ON(idev->iopf_enabled == 0)) { @@ -98,8 +98,8 @@ int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, return ret; } -static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, - struct iommufd_attach_handle *handle) +void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle) { struct iommufd_fault *fault = hwpt->fault; struct iopf_group *group, *next; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 0b1bafc7fd994..02fe1ada97cc7 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -504,35 +504,10 @@ int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt, struct iommufd_hw_pagetable *old); -static inline int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - if (hwpt->fault) - return iommufd_fault_domain_attach_dev(hwpt, idev); - - return iommu_attach_group(hwpt->domain, idev->igroup->group); -} - -static inline void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - if (hwpt->fault) { - iommufd_fault_domain_detach_dev(hwpt, idev); - return; - } - - iommu_detach_group(hwpt->domain, idev->igroup->group); -} - -static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - if (old->fault || hwpt->fault) - return iommufd_fault_domain_replace_dev(idev, hwpt, old); - - return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); -} +int iommufd_fault_iopf_enable(struct iommufd_device *idev); +void iommufd_fault_iopf_disable(struct iommufd_device *idev); +void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle); static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) From dc10ba25d43f433ad5d9e8e6be4f4d2bb3cd9ddb Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 3 Feb 2025 21:00:55 -0800 Subject: [PATCH 02/54] iommufd/fault: Remove iommufd_fault_domain_attach/detach/replace_dev() There are new attach/detach/replace helpers in device.c taking care of both the attach_handle and the fault specific routines for iopf_enable/disable() and auto response. Clean up these redundant functions in the fault.c file. Link: https://patch.msgid.link/r/3ca94625e9d78270d9a715fa0809414fddd57e58.1738645017.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/fault.c | 120 ------------------------ drivers/iommu/iommufd/iommufd_private.h | 8 -- 2 files changed, 128 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index cb844e6799d4f..931a3fbe6e32c 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -60,44 +60,6 @@ void iommufd_fault_iopf_disable(struct iommufd_device *idev) mutex_unlock(&idev->iopf_lock); } -static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - struct iommufd_attach_handle *handle; - int ret; - - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->idev = idev; - ret = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, - &handle->handle); - if (ret) - kfree(handle); - - return ret; -} - -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - int ret; - - if (!hwpt->fault) - return -EINVAL; - - ret = iommufd_fault_iopf_enable(idev); - if (ret) - return ret; - - ret = __fault_domain_attach_dev(hwpt, idev); - if (ret) - iommufd_fault_iopf_disable(idev); - - return ret; -} - void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle) { @@ -135,88 +97,6 @@ void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&fault->mutex); } -static struct iommufd_attach_handle * -iommufd_device_get_attach_handle(struct iommufd_device *idev) -{ - struct iommu_attach_handle *handle; - - handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); - if (IS_ERR(handle)) - return NULL; - - return to_iommufd_handle(handle); -} - -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - struct iommufd_attach_handle *handle; - - handle = iommufd_device_get_attach_handle(idev); - iommu_detach_group_handle(hwpt->domain, idev->igroup->group); - iommufd_auto_response_faults(hwpt, handle); - iommufd_fault_iopf_disable(idev); - kfree(handle); -} - -static int __fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - struct iommufd_attach_handle *handle, *curr = NULL; - int ret; - - if (old->fault) - curr = iommufd_device_get_attach_handle(idev); - - if (hwpt->fault) { - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->idev = idev; - ret = iommu_replace_group_handle(idev->igroup->group, - hwpt->domain, &handle->handle); - } else { - ret = iommu_replace_group_handle(idev->igroup->group, - hwpt->domain, NULL); - } - - if (!ret && curr) { - iommufd_auto_response_faults(old, curr); - kfree(curr); - } - - return ret; -} - -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - bool iopf_off = !hwpt->fault && old->fault; - bool iopf_on = hwpt->fault && !old->fault; - int ret; - - if (iopf_on) { - ret = iommufd_fault_iopf_enable(idev); - if (ret) - return ret; - } - - ret = __fault_domain_replace_dev(idev, hwpt, old); - if (ret) { - if (iopf_on) - iommufd_fault_iopf_disable(idev); - return ret; - } - - if (iopf_off) - iommufd_fault_iopf_disable(idev); - - return 0; -} - void iommufd_fault_destroy(struct iommufd_object *obj) { struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 02fe1ada97cc7..8e0e3ab647476 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -496,14 +496,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); void iommufd_fault_destroy(struct iommufd_object *obj); int iommufd_fault_iopf_handler(struct iopf_group *group); -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old); - int iommufd_fault_iopf_enable(struct iommufd_device *idev); void iommufd_fault_iopf_disable(struct iommufd_device *idev); void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, From 6d52cb738a98df6d5a91d96ce5bc557d24343964 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 12 Feb 2025 16:34:15 -0500 Subject: [PATCH 03/54] s390/pci: check for relaxed translation capability For each zdev, record whether or not CLP indicates relaxed translation capability for the associated device group. Reviewed-by: Niklas Schnelle Tested-by: Niklas Schnelle Signed-off-by: Matthew Rosato Link: https://lore.kernel.org/r/20250212213418.182902-2-mjrosato@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/include/asm/pci.h | 2 +- arch/s390/include/asm/pci_clp.h | 4 +++- arch/s390/pci/pci_clp.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 474e1f8d1d3c2..8fe4c7a72c0b4 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -144,7 +144,7 @@ struct zpci_dev { u8 util_str_avail : 1; u8 irqs_registered : 1; u8 tid_avail : 1; - u8 reserved : 1; + u8 rtr_avail : 1; /* Relaxed translation allowed */ unsigned int devfn; /* DEVFN part of the RID*/ u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index 3fff2f7095c8c..7ebff39c84b34 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -156,7 +156,9 @@ struct clp_rsp_query_pci_grp { u16 : 4; u16 noi : 12; /* number of interrupts */ u8 version; - u8 : 6; + u8 : 2; + u8 rtr : 1; /* Relaxed translation requirement */ + u8 : 3; u8 frame : 1; u8 refresh : 1; /* TLB refresh mode */ u16 : 3; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 14bf7e8d06b7a..27248686e5888 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -112,6 +112,7 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, zdev->version = response->version; zdev->maxstbl = response->maxstbl; zdev->dtsm = response->dtsm; + zdev->rtr_avail = response->rtr; switch (response->version) { case 1: From d236843a6964dc5a55dbafa5cfae63bc99cf10f8 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 12 Feb 2025 16:34:16 -0500 Subject: [PATCH 04/54] s390/pci: store DMA offset in bus_dma_region PCI devices on s390 have a DMA offset that is reported via CLP. In preparation for allowing identity domains, setup the bus_dma_region for all PCI devices using the reported CLP value. Signed-off-by: Matthew Rosato Reviewed-by: Niklas Schnelle Tested-by: Niklas Schnelle Link: https://lore.kernel.org/r/20250212213418.182902-3-mjrosato@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/pci/pci_bus.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 39a481ec4a402..0e725039861f9 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -283,10 +284,34 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) return zbus; } +static void pci_dma_range_setup(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + struct bus_dma_region *map; + u64 aligned_end; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return; + + map->cpu_start = 0; + map->dma_start = PAGE_ALIGN(zdev->start_dma); + aligned_end = PAGE_ALIGN_DOWN(zdev->end_dma + 1); + if (aligned_end >= map->dma_start) + map->size = aligned_end - map->dma_start; + else + map->size = 0; + WARN_ON_ONCE(map->size == 0); + + pdev->dev.dma_range_map = map; +} + void pcibios_bus_add_device(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); + pci_dma_range_setup(pdev); + /* * With pdev->no_vf_scan the common PCI probing code does not * perform PF/VF linking. From 0ed5967a0a63552e5ca0f2382ce362f84e83801c Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 12 Feb 2025 16:34:17 -0500 Subject: [PATCH 05/54] iommu/s390: handle IOAT registration based on domain At this point, the dma_table is really a property of the s390-iommu domain. Rather than checking its contents elsewhere in the codebase, move the code that registers the table with firmware into s390-iommu and make a decision what to register with firmware based upon the type of domain in use for the device in question. Tested-by: Niklas Schnelle Reviewed-by: Niklas Schnelle Signed-off-by: Matthew Rosato Link: https://lore.kernel.org/r/20250212213418.182902-4-mjrosato@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/include/asm/pci.h | 2 ++ arch/s390/kvm/pci.c | 17 ++------------- arch/s390/pci/pci.c | 35 +++++++++++++++++------------- arch/s390/pci/pci_sysfs.c | 11 +--------- drivers/iommu/s390-iommu.c | 43 +++++++++++++++++++++++++++++++++++-- 5 files changed, 66 insertions(+), 42 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 8fe4c7a72c0b4..7cc6a7cc66e75 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -217,6 +217,7 @@ extern struct airq_iv *zpci_aif_sbv; struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); int zpci_add_device(struct zpci_dev *zdev); int zpci_enable_device(struct zpci_dev *); +int zpci_reenable_device(struct zpci_dev *zdev); int zpci_disable_device(struct zpci_dev *); int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); int zpci_deconfigure_device(struct zpci_dev *zdev); @@ -245,6 +246,7 @@ void update_uid_checking(bool new); /* IOMMU Interface */ int zpci_init_iommu(struct zpci_dev *zdev); void zpci_destroy_iommu(struct zpci_dev *zdev); +int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status); #ifdef CONFIG_PCI static inline bool zpci_use_mio(struct zpci_dev *zdev) diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 9b9e7fdd53807..8c40154ff50ff 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -433,7 +433,6 @@ static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) { struct zpci_dev *zdev = opaque; - u8 status; int rc; if (!zdev) @@ -480,13 +479,7 @@ static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) */ zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); - rc = zpci_enable_device(zdev); - if (rc) - goto clear_gisa; - - /* Re-register the IOMMU that was already created */ - rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status); + rc = zpci_reenable_device(zdev); if (rc) goto clear_gisa; @@ -516,7 +509,6 @@ static void kvm_s390_pci_unregister_kvm(void *opaque) { struct zpci_dev *zdev = opaque; struct kvm *kvm; - u8 status; if (!zdev) return; @@ -550,12 +542,7 @@ static void kvm_s390_pci_unregister_kvm(void *opaque) goto out; } - if (zpci_enable_device(zdev)) - goto out; - - /* Re-register the IOMMU that was already created */ - zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status); + zpci_reenable_device(zdev); out: spin_lock(&kvm->arch.kzdev_list_lock); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 88f72745fa59e..fa879351efb56 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -124,14 +124,13 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, struct zpci_fib fib = {0}; u8 cc; - WARN_ON_ONCE(iota & 0x3fff); fib.pba = base; /* Work around off by one in ISM virt device */ if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base) fib.pal = limit + (1 << 12); else fib.pal = limit; - fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; + fib.iota = iota; fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, status); if (cc) @@ -690,6 +689,23 @@ int zpci_enable_device(struct zpci_dev *zdev) } EXPORT_SYMBOL_GPL(zpci_enable_device); +int zpci_reenable_device(struct zpci_dev *zdev) +{ + u8 status; + int rc; + + rc = zpci_enable_device(zdev); + if (rc) + return rc; + + rc = zpci_iommu_register_ioat(zdev, &status); + if (rc) + zpci_disable_device(zdev); + + return rc; +} +EXPORT_SYMBOL_GPL(zpci_reenable_device); + int zpci_disable_device(struct zpci_dev *zdev) { u32 fh = zdev->fh; @@ -739,7 +755,6 @@ EXPORT_SYMBOL_GPL(zpci_disable_device); */ int zpci_hot_reset_device(struct zpci_dev *zdev) { - u8 status; int rc; lockdep_assert_held(&zdev->state_lock); @@ -758,19 +773,9 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) return rc; } - rc = zpci_enable_device(zdev); - if (rc) - return rc; - - if (zdev->dma_table) - rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status); - if (rc) { - zpci_disable_device(zdev); - return rc; - } + rc = zpci_reenable_device(zdev); - return 0; + return rc; } /** diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index 2de1ea6c3a8cb..0ecad08e1b1ee 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -52,7 +52,6 @@ static DEVICE_ATTR_RO(mio_enabled); static int _do_recover(struct pci_dev *pdev, struct zpci_dev *zdev) { - u8 status; int ret; pci_stop_and_remove_bus_device(pdev); @@ -70,16 +69,8 @@ static int _do_recover(struct pci_dev *pdev, struct zpci_dev *zdev) return ret; } - ret = zpci_enable_device(zdev); - if (ret) - return ret; + ret = zpci_reenable_device(zdev); - if (zdev->dma_table) { - ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status); - if (ret) - zpci_disable_device(zdev); - } return ret; } diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index fbdeded3d48b5..007ccfdad4954 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -381,6 +381,46 @@ static void zdev_s390_domain_update(struct zpci_dev *zdev, spin_unlock_irqrestore(&zdev->dom_lock, flags); } +static int s390_iommu_domain_reg_ioat(struct zpci_dev *zdev, + struct iommu_domain *domain, u8 *status) +{ + struct s390_domain *s390_domain; + int rc = 0; + u64 iota; + + switch (domain->type) { + case IOMMU_DOMAIN_IDENTITY: + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, + zdev->end_dma, 0, status); + break; + case IOMMU_DOMAIN_BLOCKED: + /* Nothing to do in this case */ + break; + default: + s390_domain = to_s390_domain(domain); + iota = virt_to_phys(s390_domain->dma_table) | + ZPCI_IOTA_RTTO_FLAG; + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, + zdev->end_dma, iota, status); + } + + return rc; +} + +int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&zdev->dom_lock, flags); + + rc = s390_iommu_domain_reg_ioat(zdev, zdev->s390_domain, status); + + spin_unlock_irqrestore(&zdev->dom_lock, flags); + + return rc; +} + static int blocking_domain_attach_device(struct iommu_domain *domain, struct device *dev) { @@ -422,8 +462,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, blocking_domain_attach_device(&blocking_domain, dev); /* If we fail now DMA remains blocked via blocking domain */ - cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(s390_domain->dma_table), &status); + cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) return -EIO; zdev->dma_table = s390_domain->dma_table; From 64af12c6ec3afd7d44bc8b2044eee59f98059087 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 12 Feb 2025 16:34:18 -0500 Subject: [PATCH 06/54] iommu/s390: implement iommu passthrough via identity domain Enabled via the kernel command-line 'iommu.passthrough=1' option. Introduce the concept of identity domains to s390-iommu, which relies on the bus_dma_region to offset identity mappings to the start of the DMA aperture advertized by CLP. Tested-by: Niklas Schnelle Reviewed-by: Niklas Schnelle Signed-off-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250212213418.182902-5-mjrosato@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 95 +++++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 007ccfdad4954..e1c76e0f9c2b7 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -16,7 +16,7 @@ #include "dma-iommu.h" -static const struct iommu_ops s390_iommu_ops; +static const struct iommu_ops s390_iommu_ops, s390_iommu_rtr_ops; static struct kmem_cache *dma_region_table_cache; static struct kmem_cache *dma_page_table_cache; @@ -432,9 +432,11 @@ static int blocking_domain_attach_device(struct iommu_domain *domain, return 0; s390_domain = to_s390_domain(zdev->s390_domain); - spin_lock_irqsave(&s390_domain->list_lock, flags); - list_del_rcu(&zdev->iommu_list); - spin_unlock_irqrestore(&s390_domain->list_lock, flags); + if (zdev->dma_table) { + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_del_rcu(&zdev->iommu_list); + spin_unlock_irqrestore(&s390_domain->list_lock, flags); + } zpci_unregister_ioat(zdev, 0); zdev->dma_table = NULL; @@ -762,7 +764,13 @@ int zpci_init_iommu(struct zpci_dev *zdev) if (rc) goto out_err; - rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops, NULL); + if (zdev->rtr_avail) { + rc = iommu_device_register(&zdev->iommu_dev, + &s390_iommu_rtr_ops, NULL); + } else { + rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops, + NULL); + } if (rc) goto out_sysfs; @@ -826,6 +834,39 @@ static int __init s390_iommu_init(void) } subsys_initcall(s390_iommu_init); +static int s390_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) +{ + struct zpci_dev *zdev = to_zpci_dev(dev); + u8 status; + int cc; + + blocking_domain_attach_device(&blocking_domain, dev); + + /* If we fail now DMA remains blocked via blocking domain */ + cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); + + /* + * If the device is undergoing error recovery the reset code + * will re-establish the new domain. + */ + if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + return -EIO; + + zdev_s390_domain_update(zdev, domain); + + return 0; +} + +static const struct iommu_domain_ops s390_identity_ops = { + .attach_dev = s390_attach_dev_identity, +}; + +static struct iommu_domain s390_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &s390_identity_ops, +}; + static struct iommu_domain blocking_domain = { .type = IOMMU_DOMAIN_BLOCKED, .ops = &(const struct iommu_domain_ops) { @@ -833,23 +874,31 @@ static struct iommu_domain blocking_domain = { } }; -static const struct iommu_ops s390_iommu_ops = { - .blocked_domain = &blocking_domain, - .release_domain = &blocking_domain, - .capable = s390_iommu_capable, - .domain_alloc_paging = s390_domain_alloc_paging, - .probe_device = s390_iommu_probe_device, - .device_group = generic_device_group, - .pgsize_bitmap = SZ_4K, - .get_resv_regions = s390_iommu_get_resv_regions, - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = s390_iommu_attach_device, - .map_pages = s390_iommu_map_pages, - .unmap_pages = s390_iommu_unmap_pages, - .flush_iotlb_all = s390_iommu_flush_iotlb_all, - .iotlb_sync = s390_iommu_iotlb_sync, - .iotlb_sync_map = s390_iommu_iotlb_sync_map, - .iova_to_phys = s390_iommu_iova_to_phys, - .free = s390_domain_free, +#define S390_IOMMU_COMMON_OPS() \ + .blocked_domain = &blocking_domain, \ + .release_domain = &blocking_domain, \ + .capable = s390_iommu_capable, \ + .domain_alloc_paging = s390_domain_alloc_paging, \ + .probe_device = s390_iommu_probe_device, \ + .device_group = generic_device_group, \ + .pgsize_bitmap = SZ_4K, \ + .get_resv_regions = s390_iommu_get_resv_regions, \ + .default_domain_ops = &(const struct iommu_domain_ops) { \ + .attach_dev = s390_iommu_attach_device, \ + .map_pages = s390_iommu_map_pages, \ + .unmap_pages = s390_iommu_unmap_pages, \ + .flush_iotlb_all = s390_iommu_flush_iotlb_all, \ + .iotlb_sync = s390_iommu_iotlb_sync, \ + .iotlb_sync_map = s390_iommu_iotlb_sync_map, \ + .iova_to_phys = s390_iommu_iova_to_phys, \ + .free = s390_domain_free, \ } + +static const struct iommu_ops s390_iommu_ops = { + S390_IOMMU_COMMON_OPS() +}; + +static const struct iommu_ops s390_iommu_rtr_ops = { + .identity_domain = &s390_identity_domain, + S390_IOMMU_COMMON_OPS() }; From 5276c1e07679c44ee021e88045bbb5190831b328 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Wed, 19 Feb 2025 10:13:53 +0100 Subject: [PATCH 07/54] iommu/io-pgtable-dart: Only set subpage protection disable for DART 1 Subpage protection can't be disabled on t6000-style darts, as such the disable flag no longer applies, and probably even affects something else. Fixes: dc09fe1c5edd ("iommu/io-pgtable-dart: Add DART PTE support for t6000") Signed-off-by: Asahi Lina Signed-off-by: Sasha Finkelstein Reviewed-by: Sven Peter Link: https://lore.kernel.org/r/20250219-dart2-no-sp-disable-v1-1-9f324cfa4e70@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-dart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index c004640640ee5..06aca9ab52f9a 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -135,7 +135,6 @@ static int dart_init_pte(struct dart_io_pgtable *data, pte |= FIELD_PREP(APPLE_DART_PTE_SUBPAGE_START, 0); pte |= FIELD_PREP(APPLE_DART_PTE_SUBPAGE_END, 0xfff); - pte |= APPLE_DART1_PTE_PROT_SP_DIS; pte |= APPLE_DART_PTE_VALID; for (i = 0; i < num_entries; i++) @@ -211,6 +210,7 @@ static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data, dart_iopte pte = 0; if (data->iop.fmt == APPLE_DART) { + pte |= APPLE_DART1_PTE_PROT_SP_DIS; if (!(prot & IOMMU_WRITE)) pte |= APPLE_DART1_PTE_PROT_NO_WRITE; if (!(prot & IOMMU_READ)) From 1f7df3a691740a7736bbc99dc4ed536120eb4746 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:36 -0800 Subject: [PATCH 08/54] genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie The IOMMU translation for MSI message addresses has been a 2-step process, separated in time: 1) iommu_dma_prepare_msi(): A cookie pointer containing the IOVA address is stored in the MSI descriptor when an MSI interrupt is allocated. 2) iommu_dma_compose_msi_msg(): this cookie pointer is used to compute a translated message address. This has an inherent lifetime problem for the pointer stored in the cookie that must remain valid between the two steps. However, there is no locking at the irq layer that helps protect the lifetime. Today, this works under the assumption that the iommu domain is not changed while MSI interrupts being programmed. This is true for normal DMA API users within the kernel, as the iommu domain is attached before the driver is probed and cannot be changed while a driver is attached. Classic VFIO type1 also prevented changing the iommu domain while VFIO was running as it does not support changing the "container" after starting up. However, iommufd has improved this so that the iommu domain can be changed during VFIO operation. This potentially allows userspace to directly race VFIO_DEVICE_ATTACH_IOMMUFD_PT (which calls iommu_attach_group()) and VFIO_DEVICE_SET_IRQS (which calls into iommu_dma_compose_msi_msg()). This potentially causes both the cookie pointer and the unlocked call to iommu_get_domain_for_dev() on the MSI translation path to become UAFs. Fix the MSI cookie UAF by removing the cookie pointer. The translated IOVA address is already known during iommu_dma_prepare_msi() and cannot change. Thus, it can simply be stored as an integer in the MSI descriptor. The other UAF related to iommu_get_domain_for_dev() will be addressed in patch "iommu: Make iommu_dma_prepare_msi() into a generic operation" by using the IOMMU group mutex. Link: https://patch.msgid.link/r/a4f2cd76b9dc1833ee6c1cf325cba57def22231c.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- drivers/iommu/dma-iommu.c | 28 +++++++++++++--------------- include/linux/msi.h | 33 ++++++++++++--------------------- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 2a9fa0c8cc00f..0f0caf59023c7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1815,7 +1815,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) static DEFINE_MUTEX(msi_prepare_lock); /* see below */ if (!domain || !domain->iova_cookie) { - desc->iommu_cookie = NULL; + msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } @@ -1827,11 +1827,12 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) mutex_lock(&msi_prepare_lock); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); mutex_unlock(&msi_prepare_lock); - - msi_desc_set_iommu_cookie(desc, msi_page); - if (!msi_page) return -ENOMEM; + + msi_desc_set_iommu_msi_iova( + desc, msi_page->iova, + ilog2(cookie_msi_granule(domain->iova_cookie))); return 0; } @@ -1842,18 +1843,15 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) */ void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { - struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - const struct iommu_dma_msi_page *msi_page; +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - msi_page = msi_desc_get_iommu_cookie(desc); - - if (!domain || !domain->iova_cookie || WARN_ON(!msi_page)) - return; - - msg->address_hi = upper_32_bits(msi_page->iova); - msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1; - msg->address_lo += lower_32_bits(msi_page->iova); + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); + } +#endif } static int iommu_dma_init(void) diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00ea..fc4f3774c3af6 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -166,6 +166,10 @@ struct msi_desc_data { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @iommu_msi_iova: Optional shifted IOVA from the IOMMU to override the msi_addr. + * Only used if iommu_msi_shift != 0 + * @iommu_msi_shift: Indicates how many bits of the original address should be + * preserved when using iommu_msi_iova. * @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message @@ -184,7 +188,8 @@ struct msi_desc { struct msi_msg msg; struct irq_affinity_desc *affinity; #ifdef CONFIG_IRQ_MSI_IOMMU - const void *iommu_cookie; + u64 iommu_msi_iova : 58; + u64 iommu_msi_shift : 6; #endif #ifdef CONFIG_SYSFS struct device_attribute *sysfs_attrs; @@ -285,28 +290,14 @@ struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid, #define msi_desc_to_dev(desc) ((desc)->dev) -#ifdef CONFIG_IRQ_MSI_IOMMU -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -{ - return desc->iommu_cookie; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ - desc->iommu_cookie = iommu_cookie; -} -#else -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) +static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_iova, + unsigned int msi_shift) { - return NULL; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ -} +#ifdef CONFIG_IRQ_MSI_IOMMU + desc->iommu_msi_iova = msi_iova >> msi_shift; + desc->iommu_msi_shift = msi_shift; #endif +} int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); From 9349887e93009331e751854843f73a086bef4018 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:37 -0800 Subject: [PATCH 09/54] genirq/msi: Refactor iommu_dma_compose_msi_msg() The two-step process to translate the MSI address involves two functions, iommu_dma_prepare_msi() and iommu_dma_compose_msi_msg(). Previously iommu_dma_compose_msi_msg() needed to be in the iommu layer as it had to dereference the opaque cookie pointer. Now, the previous patch changed the cookie pointer into an integer so there is no longer any need for the iommu layer to be involved. Further, the call sites of iommu_dma_compose_msi_msg() all follow the same pattern of setting an MSI message address_hi/lo to non-translated and then immediately calling iommu_dma_compose_msi_msg(). Refactor iommu_dma_compose_msi_msg() into msi_msg_set_addr() that directly accepts the u64 version of the address and simplifies all the callers. Move the new helper to linux/msi.h since it has nothing to do with iommu. Aside from refactoring, this logically prepares for the next patch, which allows multiple implementation options for iommu_dma_prepare_msi(). So, it does not make sense to have the iommu_dma_compose_msi_msg() in dma-iommu.c as it no longer provides the only iommu_dma_prepare_msi() implementation. Link: https://patch.msgid.link/r/eda62a9bafa825e9cdabd7ddc61ad5a21c32af24.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- drivers/iommu/dma-iommu.c | 18 ------------------ drivers/irqchip/irq-gic-v2m.c | 5 +---- drivers/irqchip/irq-gic-v3-its.c | 13 +++---------- drivers/irqchip/irq-gic-v3-mbi.c | 12 ++++-------- drivers/irqchip/irq-ls-scfg-msi.c | 5 ++--- include/linux/iommu.h | 6 ------ include/linux/msi.h | 28 ++++++++++++++++++++++++++++ 7 files changed, 38 insertions(+), 49 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 0f0caf59023c7..bf91e014d1791 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1836,24 +1836,6 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return 0; } -/** - * iommu_dma_compose_msi_msg() - Apply translation to an MSI message - * @desc: MSI descriptor prepared by iommu_dma_prepare_msi() - * @msg: MSI message containing target physical address - */ -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -#ifdef CONFIG_IRQ_MSI_IOMMU - if (desc->iommu_msi_shift) { - u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - - msg->address_hi = upper_32_bits(msi_iova); - msg->address_lo = lower_32_bits(msi_iova) | - (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); - } -#endif -} - static int iommu_dma_init(void) { if (is_kdump_kernel()) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index be35c5349986a..57e0470e0d133 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -87,9 +87,6 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct v2m_data *v2m = irq_data_get_irq_chip_data(data); phys_addr_t addr = gicv2m_get_msi_addr(v2m, data->hwirq); - msg->address_hi = upper_32_bits(addr); - msg->address_lo = lower_32_bits(addr); - if (v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY) msg->data = 0; else @@ -97,7 +94,7 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (v2m->flags & GICV2M_NEEDS_SPI_OFFSET) msg->data -= v2m->spi_offset; - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, addr); } static struct irq_chip gicv2m_irq_chip = { diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 8c3ec5734f1ef..ce0bf70b9eaf8 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1809,17 +1809,10 @@ static u64 its_irq_get_msi_base(struct its_device *its_dev) static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); - struct its_node *its; - u64 addr; - - its = its_dev->its; - addr = its->get_msi_base(its_dev); - - msg->address_lo = lower_32_bits(addr); - msg->address_hi = upper_32_bits(addr); - msg->data = its_get_event_id(d); - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg); + msg->data = its_get_event_id(d); + msi_msg_set_addr(irq_data_get_msi_desc(d), msg, + its_dev->its->get_msi_base(its_dev)); } static int its_irq_set_irqchip_state(struct irq_data *d, diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index 3fe870f8ee174..a6510128611e0 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ -147,22 +147,18 @@ static const struct irq_domain_ops mbi_domain_ops = { static void mbi_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - msg[0].address_hi = upper_32_bits(mbi_phys_base + GICD_SETSPI_NSR); - msg[0].address_lo = lower_32_bits(mbi_phys_base + GICD_SETSPI_NSR); msg[0].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[0], + mbi_phys_base + GICD_SETSPI_NSR); } static void mbi_compose_mbi_msg(struct irq_data *data, struct msi_msg *msg) { mbi_compose_msi_msg(data, msg); - msg[1].address_hi = upper_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); - msg[1].address_lo = lower_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); msg[1].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), &msg[1]); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[1], + mbi_phys_base + GICD_CLRSPI_NSR); } static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain, diff --git a/drivers/irqchip/irq-ls-scfg-msi.c b/drivers/irqchip/irq-ls-scfg-msi.c index c0e1aafe468c3..3cb80796cc7cc 100644 --- a/drivers/irqchip/irq-ls-scfg-msi.c +++ b/drivers/irqchip/irq-ls-scfg-msi.c @@ -87,8 +87,6 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { struct ls_scfg_msi *msi_data = irq_data_get_irq_chip_data(data); - msg->address_hi = upper_32_bits(msi_data->msiir_addr); - msg->address_lo = lower_32_bits(msi_data->msiir_addr); msg->data = data->hwirq; if (msi_affinity_flag) { @@ -98,7 +96,8 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) msg->data |= cpumask_first(mask); } - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, + msi_data->msiir_addr); } static int ls_scfg_msi_set_affinity(struct irq_data *irq_data, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 38c65e92ecd09..caee952febd4f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1508,7 +1508,6 @@ static inline void iommu_debugfs_setup(void) {} int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg); #else /* CONFIG_IOMMU_DMA */ @@ -1524,11 +1523,6 @@ static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_a { return 0; } - -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -} - #endif /* CONFIG_IOMMU_DMA */ /* diff --git a/include/linux/msi.h b/include/linux/msi.h index fc4f3774c3af6..8d97b890faeca 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -299,6 +299,34 @@ static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_io #endif } +/** + * msi_msg_set_addr() - Set MSI address in an MSI message + * + * @desc: MSI descriptor that may carry an IOVA base address for MSI via @iommu_msi_iova/shift + * @msg: Target MSI message to set its address_hi and address_lo + * @msi_addr: Physical address to set the MSI message + * + * Notes: + * - Override @msi_addr using the IOVA base address in the @desc if @iommu_msi_shift is set + * - Otherwise, simply set @msi_addr to @msg + */ +static inline void msi_msg_set_addr(struct msi_desc *desc, struct msi_msg *msg, + phys_addr_t msi_addr) +{ +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; + + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msi_addr & ((1 << desc->iommu_msi_shift) - 1)); + return; + } +#endif + msg->address_hi = upper_32_bits(msi_addr); + msg->address_lo = lower_32_bits(msi_addr); +} + int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); /** From 288683c92b1abc32277c83819bea287af614d239 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:38 -0800 Subject: [PATCH 10/54] iommu: Make iommu_dma_prepare_msi() into a generic operation SW_MSI supports IOMMU to translate an MSI message before the MSI message is delivered to the interrupt controller. On such systems, an iommu_domain must have a translation for the MSI message for interrupts to work. The IRQ subsystem will call into IOMMU to request that a physical page be set up to receive MSI messages, and the IOMMU then sets an IOVA that maps to that physical page. Ultimately the IOVA is programmed into the device via the msi_msg. Generalize this by allowing iommu_domain owners to provide implementations of this mapping. Add a function pointer in struct iommu_domain to allow a domain owner to provide its own implementation. Have dma-iommu supply its implementation for IOMMU_DOMAIN_DMA types during the iommu_get_dma_cookie() path. For IOMMU_DOMAIN_UNMANAGED types used by VFIO (and iommufd for now), have the same iommu_dma_sw_msi set as well in the iommu_get_msi_cookie() path. Hold the group mutex while in iommu_dma_prepare_msi() to ensure the domain doesn't change or become freed while running. Races with IRQ operations from VFIO and domain changes from iommufd are possible here. Replace the msi_prepare_lock with a lockdep assertion for the group mutex as documentation. For the dmau_iommu.c each iommu_domain is unique to a group. Link: https://patch.msgid.link/r/4ca696150d2baee03af27c4ddefdb7b0b0280e7b.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/dma-iommu.c | 33 +++++++++++++---------------- drivers/iommu/iommu.c | 29 ++++++++++++++++++++++++++ include/linux/iommu.h | 44 ++++++++++++++++++++++++++------------- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index bf91e014d1791..3b58244e6344a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,9 @@ static int __init iommu_dma_forcedac_setup(char *str) } early_param("iommu.forcedac", iommu_dma_forcedac_setup); +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + /* Number of entries per flush queue */ #define IOVA_DEFAULT_FQ_SIZE 256 #define IOVA_SINGLE_FQ_SIZE 32768 @@ -398,6 +402,7 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) return -ENOMEM; mutex_init(&domain->iova_cookie->mutex); + iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } @@ -429,6 +434,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) cookie->msi_iova = base; domain->iova_cookie = cookie; + iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } EXPORT_SYMBOL(iommu_get_msi_cookie); @@ -443,6 +449,9 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; + if (domain->sw_msi != iommu_dma_sw_msi) + return; + if (!cookie) return; @@ -1800,33 +1809,19 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return NULL; } -/** - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain - * @desc: MSI descriptor, will store the MSI page - * @msi_addr: MSI target address to be mapped - * - * Return: 0 on success or negative error code if the mapping failed. - */ -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_dma_msi_page *msi_page; - static DEFINE_MUTEX(msi_prepare_lock); /* see below */ + const struct iommu_dma_msi_page *msi_page; - if (!domain || !domain->iova_cookie) { + if (!domain->iova_cookie) { msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } - /* - * In fact the whole prepare operation should already be serialised by - * irq_domain_mutex further up the callchain, but that's pretty subtle - * on its own, so consider this locking as failsafe documentation... - */ - mutex_lock(&msi_prepare_lock); + iommu_group_mutex_assert(dev); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); - mutex_unlock(&msi_prepare_lock); if (!msi_page) return -ENOMEM; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 870c3cdbd0f62..022bf96a18c5e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3596,3 +3596,32 @@ int iommu_replace_group_handle(struct iommu_group *group, return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) +/** + * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain + * @desc: MSI descriptor, will store the MSI page + * @msi_addr: MSI target address to be mapped + * + * The implementation of sw_msi() should take msi_addr and map it to + * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the + * mapping information. + * + * Return: 0 on success or negative error code if the mapping failed. + */ +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommu_group *group = dev->iommu_group; + int ret = 0; + + if (!group) + return 0; + + mutex_lock(&group->mutex); + if (group->domain && group->domain->sw_msi) + ret = group->domain->sw_msi(group->domain, desc, msi_addr); + mutex_unlock(&group->mutex); + return ret; +} +#endif /* CONFIG_IRQ_MSI_IOMMU */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index caee952febd4f..761c5e186de96 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -44,6 +44,8 @@ struct iommu_dma_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; +struct msi_desc; +struct msi_msg; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -216,6 +218,12 @@ struct iommu_domain { struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); +#endif + void *fault_data; union { struct { @@ -234,6 +242,16 @@ struct iommu_domain { }; }; +static inline void iommu_domain_set_sw_msi( + struct iommu_domain *domain, + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr)) +{ +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + domain->sw_msi = sw_msi; +#endif +} + static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; @@ -1470,6 +1488,18 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#ifdef CONFIG_IRQ_MSI_IOMMU +#ifdef CONFIG_IOMMU_API +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); +#else +static inline int iommu_dma_prepare_msi(struct msi_desc *desc, + phys_addr_t msi_addr) +{ + return 0; +} +#endif /* CONFIG_IOMMU_API */ +#endif /* CONFIG_IRQ_MSI_IOMMU */ + #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) void iommu_group_mutex_assert(struct device *dev); #else @@ -1503,26 +1533,12 @@ static inline void iommu_debugfs_setup(void) {} #endif #ifdef CONFIG_IOMMU_DMA -#include - int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); - -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); - #else /* CONFIG_IOMMU_DMA */ - -struct msi_desc; -struct msi_msg; - static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { return -ENODEV; } - -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) -{ - return 0; -} #endif /* CONFIG_IOMMU_DMA */ /* From 96093fe54f4864b07013cf61b847708233832841 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:39 -0800 Subject: [PATCH 11/54] irqchip: Have CONFIG_IRQ_MSI_IOMMU be selected by irqchips that need it Currently, IRQ_MSI_IOMMU is selected if DMA_IOMMU is available to provide an implementation for iommu_dma_prepare/compose_msi_msg(). However, it'll make more sense for irqchips that call prepare/compose to select it, and that will trigger all the additional code and data to be compiled into the kernel. If IRQ_MSI_IOMMU is selected with no IOMMU side implementation, then the prepare/compose() will be NOP stubs. If IRQ_MSI_IOMMU is not selected by an irqchip, then the related code on the iommu side is compiled out. Link: https://patch.msgid.link/r/a2620f67002c5cdf974e89ca3bf905f5c0817be6.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- drivers/iommu/Kconfig | 1 - drivers/iommu/dma-iommu.c | 2 ++ drivers/irqchip/Kconfig | 4 ++++ kernel/irq/Kconfig | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index ec1b5e32b9725..5124e7431fe31 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -154,7 +154,6 @@ config IOMMU_DMA select DMA_OPS_HELPERS select IOMMU_API select IOMMU_IOVA - select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH select NEED_SG_DMA_FLAGS if SWIOTLB diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 3b58244e6344a..94263ed2c5644 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -449,8 +449,10 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) if (domain->sw_msi != iommu_dma_sw_msi) return; +#endif if (!cookie) return; diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index c11b9965c4ad9..64658a1c3aa18 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -28,6 +28,7 @@ config ARM_GIC_V2M select ARM_GIC select IRQ_MSI_LIB select PCI_MSI + select IRQ_MSI_IOMMU config GIC_NON_BANKED bool @@ -38,12 +39,14 @@ config ARM_GIC_V3 select PARTITION_PERCPU select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP select HAVE_ARM_SMCCC_DISCOVERY + select IRQ_MSI_IOMMU config ARM_GIC_V3_ITS bool select GENERIC_MSI_IRQ select IRQ_MSI_LIB default ARM_GIC_V3 + select IRQ_MSI_IOMMU config ARM_GIC_V3_ITS_FSL_MC bool @@ -408,6 +411,7 @@ config LS_EXTIRQ config LS_SCFG_MSI def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE + select IRQ_MSI_IOMMU depends on PCI_MSI config PARTITION_PERCPU diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5432418c0feaf..9636aed20401b 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -100,6 +100,7 @@ config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY +# irqchip drivers should select this if they call iommu_dma_prepare_msi() config IRQ_MSI_IOMMU bool From 748706d7ca06012621b32851b68136bf33613b9e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 19 Feb 2025 17:31:40 -0800 Subject: [PATCH 12/54] iommu: Turn fault_data to iommufd private pointer A "fault_data" was added exclusively for the iommufd_fault_iopf_handler() used by IOPF/PRI use cases, along with the attach_handle. Now, the iommufd version of the sw_msi function will reuse the attach_handle and fault_data for a non-fault case. Rename "fault_data" to "iommufd_hwpt" so as not to confine it to a "fault" case. Move it into a union to be the iommufd private pointer. A following patch will move the iova_cookie to the union for dma-iommu too after the iommufd_sw_msi implementation is added. Since we have two unions now, add some simple comments for readability. Link: https://patch.msgid.link/r/ee5039503f28a16590916e9eef28b917e2d1607a.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/fault.c | 2 +- drivers/iommu/iommufd/hw_pagetable.c | 2 +- include/linux/iommu.h | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 931a3fbe6e32c..c48d72c9668ca 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -329,7 +329,7 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) struct iommufd_hw_pagetable *hwpt; struct iommufd_fault *fault; - hwpt = group->attach_handle->domain->fault_data; + hwpt = group->attach_handle->domain->iommufd_hwpt; fault = hwpt->fault; spin_lock(&fault->lock); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 598be26a14e28..2641d50f46cf1 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -406,10 +406,10 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } hwpt->fault = fault; hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; - hwpt->domain->fault_data = hwpt; refcount_inc(&fault->obj.users); iommufd_put_object(ucmd->ictx, &fault->obj); } + hwpt->domain->iommufd_hwpt = hwpt; cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 761c5e186de96..e93d2e918599f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -224,8 +224,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - void *fault_data; - union { + union { /* Pointer usable by owner of the domain */ + struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ + }; + union { /* Fault handler */ struct { iommu_fault_handler_t handler; void *handler_token; From 40f5175d0eb77f902ba8e2a5df2a8f3a218c8843 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:41 -0800 Subject: [PATCH 13/54] iommufd: Implement sw_msi support natively iommufd has a model where the iommu_domain can be changed while the VFIO device is attached. In this case, the MSI should continue to work. This corner case has not worked because the dma-iommu implementation of sw_msi is tied to a single domain. Implement the sw_msi mapping directly and use a global per-fd table to associate assigned IOVA to the MSI pages. This allows the MSI pages to be loaded into a domain before it is attached ensuring that MSI is not disrupted. Link: https://patch.msgid.link/r/e13d23eeacd67c0a692fc468c85b483f4dd51c57.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 161 ++++++++++++++++++++---- drivers/iommu/iommufd/hw_pagetable.c | 3 + drivers/iommu/iommufd/iommufd_private.h | 23 +++- drivers/iommu/iommufd/main.c | 9 ++ 4 files changed, 173 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 0786290b4056d..4e107f69f951a 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "../iommu-priv.h" #include "io_pagetable.h" @@ -293,36 +294,152 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); +/* + * Get a iommufd_sw_msi_map for the msi physical address requested by the irq + * layer. The mapping to IOVA is global to the iommufd file descriptor, every + * domain that is attached to a device using the same MSI parameters will use + * the same IOVA. + */ +static __maybe_unused struct iommufd_sw_msi_map * +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr, + phys_addr_t sw_msi_start) +{ + struct iommufd_sw_msi_map *cur; + unsigned int max_pgoff = 0; + + lockdep_assert_held(&ictx->sw_msi_lock); + + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + if (cur->sw_msi_start != sw_msi_start) + continue; + max_pgoff = max(max_pgoff, cur->pgoff + 1); + if (cur->msi_addr == msi_addr) + return cur; + } + + if (ictx->sw_msi_id >= + BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap)) + return ERR_PTR(-EOVERFLOW); + + cur = kzalloc(sizeof(*cur), GFP_KERNEL); + if (!cur) + return ERR_PTR(-ENOMEM); + + cur->sw_msi_start = sw_msi_start; + cur->msi_addr = msi_addr; + cur->pgoff = max_pgoff; + cur->id = ictx->sw_msi_id++; + list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list); + return cur; +} + +static int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map) +{ + unsigned long iova; + + lockdep_assert_held(&ictx->sw_msi_lock); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) { + int rc; + + rc = iommu_map(hwpt_paging->common.domain, iova, + msi_map->msi_addr, PAGE_SIZE, + IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, + GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap); + } + return 0; +} + +/* + * Called by the irq code if the platform translates the MSI address through the + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will + * allocate a fd global iova for the physical page that is the same on all + * domains and devices. + */ +#ifdef CONFIG_IRQ_MSI_IOMMU +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommufd_hwpt_paging *hwpt_paging; + struct iommu_attach_handle *raw_handle; + struct iommufd_attach_handle *handle; + struct iommufd_sw_msi_map *msi_map; + struct iommufd_ctx *ictx; + unsigned long iova; + int rc; + + /* + * It is safe to call iommu_attach_handle_get() here because the iommu + * core code invokes this under the group mutex which also prevents any + * change of the attach handle for the duration of this function. + */ + iommu_group_mutex_assert(dev); + + raw_handle = + iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); + if (IS_ERR(raw_handle)) + return 0; + hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt); + + handle = to_iommufd_handle(raw_handle); + /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */ + if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; + + ictx = handle->idev->ictx; + guard(mutex)(&ictx->sw_msi_lock); + /* + * The input msi_addr is the exact byte offset of the MSI doorbell, we + * assume the caller has checked that it is contained with a MMIO region + * that is secure to map at PAGE_SIZE. + */ + msi_map = iommufd_sw_msi_get_map(handle->idev->ictx, + msi_addr & PAGE_MASK, + handle->idev->igroup->sw_msi_start); + if (IS_ERR(msi_map)) + return PTR_ERR(msi_map); + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map); + if (rc) + return rc; + __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT); + return 0; +} +#endif + static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { - phys_addr_t sw_msi_start = igroup->sw_msi_start; - int rc; + struct iommufd_ctx *ictx = igroup->ictx; + struct iommufd_sw_msi_map *cur; + + if (igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; /* - * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to - * call iommu_get_msi_cookie() on its behalf. This is necessary to setup - * the MSI window so iommu_dma_prepare_msi() can install pages into our - * domain after request_irq(). If it is not done interrupts will not - * work on this domain. - * - * FIXME: This is conceptually broken for iommufd since we want to allow - * userspace to change the domains, eg switch from an identity IOAS to a - * DMA IOAS. There is currently no way to create a MSI window that - * matches what the IRQ layer actually expects in a newly created - * domain. + * Install all the MSI pages the device has been using into the domain */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt_paging->common.domain, - sw_msi_start); + guard(mutex)(&ictx->sw_msi_lock); + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + int rc; + + if (cur->sw_msi_start != igroup->sw_msi_start || + !test_bit(cur->id, igroup->required_sw_msi.bitmap)) + continue; + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur); if (rc) return rc; - - /* - * iommu_get_msi_cookie() can only be called once per domain, - * it returns -EBUSY on later calls. - */ - hwpt_paging->msi_cookie = true; } return 0; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 2641d50f46cf1..7de6e914232e7 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } } + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); /* * Set the coherency mode before we do iopt_table_add_domain() as some @@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, goto out_abort; } hwpt->domain->owner = ops; + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; @@ -307,6 +309,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, goto out_abort; } hwpt->domain->owner = viommu->iommu_dev->ops; + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8e0e3ab647476..246297452a44e 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 +19,22 @@ struct iommu_group; struct iommu_option; struct iommufd_device; +struct iommufd_sw_msi_map { + struct list_head sw_msi_item; + phys_addr_t sw_msi_start; + phys_addr_t msi_addr; + unsigned int pgoff; + unsigned int id; +}; + +/* Bitmap of struct iommufd_sw_msi_map::id */ +struct iommufd_sw_msi_maps { + DECLARE_BITMAP(bitmap, 64); +}; + +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + struct iommufd_ctx { struct file *file; struct xarray objects; @@ -26,6 +42,10 @@ struct iommufd_ctx { wait_queue_head_t destroy_wait; struct rw_semaphore ioas_creation_lock; + struct mutex sw_msi_lock; + struct list_head sw_msi_list; + unsigned int sw_msi_id; + u8 account_mode; /* Compatibility with VFIO no iommu */ u8 no_iommu_mode; @@ -283,10 +303,10 @@ struct iommufd_hwpt_paging { struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; - bool msi_cookie : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; + struct iommufd_sw_msi_maps present_sw_msi; }; struct iommufd_hwpt_nested { @@ -383,6 +403,7 @@ struct iommufd_group { struct iommu_group *group; struct iommufd_hw_pagetable *hwpt; struct list_head device_list; + struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ccf616462a1cb..b6fa9fd11bc18 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -227,6 +227,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init(&ictx->groups); ictx->file = filp; init_waitqueue_head(&ictx->destroy_wait); + mutex_init(&ictx->sw_msi_lock); + INIT_LIST_HEAD(&ictx->sw_msi_list); filp->private_data = ictx; return 0; } @@ -234,6 +236,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) static int iommufd_fops_release(struct inode *inode, struct file *filp) { struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_sw_msi_map *next; + struct iommufd_sw_msi_map *cur; struct iommufd_object *obj; /* @@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) break; } WARN_ON(!xa_empty(&ictx->groups)); + + mutex_destroy(&ictx->sw_msi_lock); + list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item) + kfree(cur); + kfree(ictx); return 0; } From 237603a46abf9637f9b71c6225293fac2b4d6ef7 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:46 -0800 Subject: [PATCH 14/54] iommu: Make @handle mandatory in iommu_{attach|replace}_group_handle() Caller of the two APIs always provide a valid handle, make @handle as mandatory parameter. Take this chance incoporate the handle->domain set under the protection of group->mutex in iommu_attach_group_handle(). Link: https://patch.msgid.link/r/20250226011849.5102-2-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 022bf96a18c5e..81189895df37c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3511,10 +3511,11 @@ int iommu_attach_group_handle(struct iommu_domain *domain, { int ret; - if (handle) - handle->domain = domain; + if (!handle) + return -EINVAL; mutex_lock(&group->mutex); + handle->domain = domain; ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); if (ret) goto err_unlock; @@ -3568,16 +3569,14 @@ int iommu_replace_group_handle(struct iommu_group *group, void *curr; int ret; - if (!new_domain) + if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - if (handle) { - ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); - if (ret) - goto err_unlock; - handle->domain = new_domain; - } + handle->domain = new_domain; + ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); + if (ret) + goto err_unlock; ret = __iommu_group_set_domain(group, new_domain); if (ret) From 473ec072a63370e37dddbadb2a7cc2419a0fdb28 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:47 -0800 Subject: [PATCH 15/54] iommu: Drop iommu_group_replace_domain() iommufd does not use it now, so drop it. Link: https://patch.msgid.link/r/20250226011849.5102-3-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu-priv.h | 3 --- drivers/iommu/iommu.c | 35 ++++++----------------------------- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index de5b54eaa8bf1..b4508423e13ba 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -24,9 +24,6 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ? fwspec->iommu_fwnode : NULL); } -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain); - int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 81189895df37c..37c8cc6a9f79a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2187,32 +2187,6 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_attach_group); -/** - * iommu_group_replace_domain - replace the domain that a group is attached to - * @group: IOMMU group that will be attached to the new domain - * @new_domain: new IOMMU domain to replace with - * - * This API allows the group to switch domains without being forced to go to - * the blocking domain in-between. - * - * If the currently attached domain is a core domain (e.g. a default_domain), - * it will act just like the iommu_attach_group(). - */ -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain) -{ - int ret; - - if (!new_domain) - return -EINVAL; - - mutex_lock(&group->mutex); - ret = __iommu_group_set_domain(group, new_domain); - mutex_unlock(&group->mutex); - return ret; -} -EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, "IOMMUFD_INTERNAL"); - static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, @@ -3558,9 +3532,12 @@ EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL"); * @new_domain: new IOMMU domain to replace with * @handle: attach handle * - * This is a variant of iommu_group_replace_domain(). It allows the caller to - * provide an attach handle for the new domain and use it when the domain is - * attached. + * This API allows the group to switch domains without being forced to go to + * the blocking domain in-between. It allows the caller to provide an attach + * handle for the new domain and use it when the domain is attached. + * + * If the currently attached domain is a core domain (e.g. a default_domain), + * it will act just like the iommu_attach_group_handle(). */ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, From e1ea9d30d84c65e96eba27b240be5b6798350490 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:48 -0800 Subject: [PATCH 16/54] iommu: Store either domain or handle in group->pasid_array iommu_attach_device_pasid() only stores handle to group->pasid_array when there is a valid handle input. However, it makes the iommu_attach_device_pasid() unable to detect if the pasid has been attached or not previously. To be complete, let the iommu_attach_device_pasid() store the domain to group->pasid_array if no valid handle. The other users of the group->pasid_array should be updated to be consistent. e.g. the iommu_attach_group_handle() and iommu_replace_group_handle(). Link: https://patch.msgid.link/r/20250226011849.5102-4-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 37c8cc6a9f79a..d3652518c7347 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -45,6 +45,9 @@ static unsigned int iommu_def_domain_type __read_mostly; static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; +/* Tags used with xa_tag_pointer() in group->pasid_array */ +enum { IOMMU_PASID_ARRAY_DOMAIN = 0, IOMMU_PASID_ARRAY_HANDLE = 1 }; + struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; @@ -2147,6 +2150,17 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev) return dev->iommu_group->default_domain; } +static void *iommu_make_pasid_array_entry(struct iommu_domain *domain, + struct iommu_attach_handle *handle) +{ + if (handle) { + handle->domain = domain; + return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE); + } + + return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN); +} + static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { @@ -3348,6 +3362,7 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, struct iommu_group *group = dev->iommu_group; struct group_device *device; const struct iommu_ops *ops; + void *entry; int ret; if (!group) @@ -3371,10 +3386,9 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, } } - if (handle) - handle->domain = domain; + entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, handle, GFP_KERNEL); + ret = xa_insert(&group->pasid_array, pasid, entry, GFP_KERNEL); if (ret) goto out_unlock; @@ -3454,13 +3468,17 @@ struct iommu_attach_handle * iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type) { struct iommu_attach_handle *handle; + void *entry; xa_lock(&group->pasid_array); - handle = xa_load(&group->pasid_array, pasid); - if (!handle) + entry = xa_load(&group->pasid_array, pasid); + if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) { handle = ERR_PTR(-ENOENT); - else if (type && handle->domain->type != type) - handle = ERR_PTR(-EBUSY); + } else { + handle = xa_untag_pointer(entry); + if (type && handle->domain->type != type) + handle = ERR_PTR(-EBUSY); + } xa_unlock(&group->pasid_array); return handle; @@ -3483,14 +3501,15 @@ int iommu_attach_group_handle(struct iommu_domain *domain, struct iommu_group *group, struct iommu_attach_handle *handle) { + void *entry; int ret; if (!handle) return -EINVAL; mutex_lock(&group->mutex); - handle->domain = domain; - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + entry = iommu_make_pasid_array_entry(domain, handle); + ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); if (ret) goto err_unlock; @@ -3543,14 +3562,14 @@ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle) { - void *curr; + void *curr, *entry; int ret; if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - handle->domain = new_domain; + entry = iommu_make_pasid_array_entry(new_domain, handle); ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); if (ret) goto err_unlock; @@ -3559,7 +3578,7 @@ int iommu_replace_group_handle(struct iommu_group *group, if (ret) goto err_release; - curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); WARN_ON(xa_is_err(curr)); mutex_unlock(&group->mutex); From 5e9f822c9c683ae884fa5e71df41d1647b2876c6 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:49 -0800 Subject: [PATCH 17/54] iommu: Swap the order of setting group->pasid_array and calling attach op of iommu drivers The current implementation stores entry to the group->pasid_array before the underlying iommu driver has successfully set the new domain. This can lead to issues where PRIs are received on the new domain before the attach operation is completed. This patch swaps the order of operations to ensure that the domain is set in the underlying iommu driver before updating the group->pasid_array. Link: https://patch.msgid.link/r/20250226011849.5102-5-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 48 ++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d3652518c7347..0ee17893810fd 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3388,13 +3388,29 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, entry, GFP_KERNEL); + /* + * Entry present is a failure case. Use xa_insert() instead of + * xa_reserve(). + */ + ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; ret = __iommu_set_group_pasid(domain, group, pasid); - if (ret) - xa_erase(&group->pasid_array, pasid); + if (ret) { + xa_release(&group->pasid_array, pasid); + goto out_unlock; + } + + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + pasid, entry, GFP_KERNEL))); + out_unlock: mutex_unlock(&group->mutex); return ret; @@ -3509,19 +3525,27 @@ int iommu_attach_group_handle(struct iommu_domain *domain, mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); + ret = xa_insert(&group->pasid_array, + IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) - goto err_unlock; + goto out_unlock; ret = __iommu_attach_group(domain, group); - if (ret) - goto err_erase; - mutex_unlock(&group->mutex); + if (ret) { + xa_release(&group->pasid_array, IOMMU_NO_PASID); + goto out_unlock; + } - return 0; -err_erase: - xa_erase(&group->pasid_array, IOMMU_NO_PASID); -err_unlock: + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + IOMMU_NO_PASID, entry, GFP_KERNEL))); + +out_unlock: mutex_unlock(&group->mutex); return ret; } From 7506be7d253fa6cd08fc0f47f7dcd1bc841adb93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barnab=C3=A1s=20Cz=C3=A9m=C3=A1n?= Date: Mon, 24 Feb 2025 02:56:19 +0100 Subject: [PATCH 18/54] dt-bindings: iommu: qcom,iommu: Add MSM8937 IOMMU to SMMUv1 compatibles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add MSM8937 compatible string with "qcom,msm-iommu-v1" as fallback for the MSM8937 IOMMU which is compatible with Qualcomm's secure fw "SMMU v1" implementation. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Barnabás Czémán Link: https://lore.kernel.org/r/20250224-msm8937-v3-4-dad7c182cccb@mainlining.org Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/iommu/qcom,iommu.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml b/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml index 5ae9a628261fd..3e5623edd207a 100644 --- a/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml +++ b/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml @@ -22,6 +22,7 @@ properties: - enum: - qcom,msm8916-iommu - qcom,msm8917-iommu + - qcom,msm8937-iommu - qcom,msm8953-iommu - const: qcom,msm-iommu-v1 - items: From 032d7e435cbd81ffa37b846aea5a236f3f5912a3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 26 Feb 2025 15:57:49 +0000 Subject: [PATCH 19/54] iommu/dma: Remove redundant locking This reverts commit ac9a5d522bb80be50ea84965699e1c8257d745ce. iommu_dma_init_domain() is now only called under the group mutex, as it should be given that that the default domain belongs to the group, so the additional internal locking is no longer needed. Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/a943d4c198e6a1fffe998337d577dc3aa7f660a9.1740585469.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 94263ed2c5644..0832998eca389 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -87,7 +87,6 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; /* Options for dma-iommu use */ struct iommu_dma_options options; - struct mutex mutex; }; static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); @@ -401,7 +400,6 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) if (!domain->iova_cookie) return -ENOMEM; - mutex_init(&domain->iova_cookie->mutex); iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } @@ -709,23 +707,20 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev domain->geometry.aperture_start >> order); /* start_pfn is always nonzero for an already-initialised domain */ - mutex_lock(&cookie->mutex); if (iovad->start_pfn) { if (1UL << order != iovad->granule || base_pfn != iovad->start_pfn) { pr_warn("Incompatible range for DMA domain\n"); - ret = -EFAULT; - goto done_unlock; + return -EFAULT; } - ret = 0; - goto done_unlock; + return 0; } init_iova_domain(iovad, 1UL << order, base_pfn); ret = iova_domain_init_rcaches(iovad); if (ret) - goto done_unlock; + return ret; iommu_dma_init_options(&cookie->options, dev); @@ -734,11 +729,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain))) domain->type = IOMMU_DOMAIN_DMA; - ret = iova_reserve_iommu_regions(dev, domain); - -done_unlock: - mutex_unlock(&cookie->mutex); - return ret; + return iova_reserve_iommu_regions(dev, domain); } /** From 558d2bbd457d513d9226f88fef0d9c782e595f6e Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:14 +0000 Subject: [PATCH 20/54] iommu/amd: Log IOMMU control register in event log path Useful for debugging ILLEGAL_DEV_TABLE_ENTRY events as some of the DTE settings depends on Control register settings. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b48a72bd7b23d..797c8612966a9 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -868,7 +868,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) int type, devid, flags, tag; volatile u32 *event = __evt; int count = 0; - u64 address; + u64 address, ctrl; u32 pasid; retry: @@ -878,6 +878,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) (event[1] & EVENT_DOMID_MASK_LO); flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; address = (u64)(((u64)event[3]) << 32) | event[2]; + ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); if (type == 0) { /* Did we hit the erratum? */ @@ -899,6 +900,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), pasid, address, flags); + dev_err(dev, "Control Reg : 0x%llx\n", ctrl); dump_dte_entry(iommu, devid); break; case EVENT_TYPE_DEV_TAB_ERR: From e481f8a5db2e9d3a2fbb4872cc08ad11fbc1af68 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:15 +0000 Subject: [PATCH 21/54] iommu/amd: Remove unused variable Remove 'amd_iommu_aperture_order' as its not used since commit d9cfed925448. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 23caea22f8dcd..3f57bdd0a6598 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -928,9 +928,6 @@ struct unity_map_entry { * Data structures for device handling */ -/* size of the dma_ops aperture as power of 2 */ -extern unsigned amd_iommu_aperture_order; - extern bool amd_iommu_force_isolation; /* Max levels of glxval supported */ From 36a1cfd497435ba5e37572fe9463bb62a7b1b984 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:16 +0000 Subject: [PATCH 22/54] iommu/amd/pgtbl_v2: Improve error handling Return -ENOMEM if v2_alloc_pte() fails to allocate memory. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index c616de2c5926e..a56a273963059 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -254,7 +254,7 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd, iova, map_size, gfp, &updated); if (!pte) { - ret = -EINVAL; + ret = -ENOMEM; goto out; } From 60785fceda6f4cbdee425d3320d0f541b166b5bf Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:17 +0000 Subject: [PATCH 23/54] iommu/amd: Remove outdated comment Remove outdated comment. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index f3399087859fd..26cf562dde11f 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -47,13 +47,6 @@ static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, return fpte; } -/**************************************************************************** - * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ - static void free_pt_page(u64 *pt, struct list_head *freelist) { struct page *p = virt_to_page(pt); From ee4cf9260afe8e4be6b6d64f56fa7493d051d8de Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:18 +0000 Subject: [PATCH 24/54] iommu/amd: Fix header file Move function declaration inside AMD_IOMMU_H defination. Fixes: fd5dff9de4be ("iommu/amd: Modify set_dte_entry() to use 256-bit DTE helpers") Fixes: 457da5764668 ("iommu/amd: Lock DTE before updating the entry with WRITE_ONCE()") Cc: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 68debf5ee2d75..e3bf27da1339e 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -176,12 +176,11 @@ void amd_iommu_apply_ivrs_quirks(void); #else static inline void amd_iommu_apply_ivrs_quirks(void) { } #endif +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid); void amd_iommu_domain_set_pgtable(struct protection_domain *domain, u64 *root, int mode); struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); - -#endif - -struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid); struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); + +#endif /* AMD_IOMMU_H */ From 5536e19e940be0aad326ff80870e0261c0ba5ff7 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:19 +0000 Subject: [PATCH 25/54] iommu/amd: Remove unused forward declaration Remove unused forward declaration. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index cb536d372b12e..94aaa4b2a9031 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -219,7 +219,6 @@ static bool __initdata cmdline_maps; static enum iommu_init_state init_state = IOMMU_START_STATE; static int amd_iommu_enable_interrupts(void); -static int __init iommu_go_to_state(enum iommu_init_state state); static void init_device_table_dma(struct amd_iommu_pci_seg *pci_seg); static bool amd_iommu_pre_enabled = true; From 625586855f002ea2dd4aa9552e3005745f285688 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:20 +0000 Subject: [PATCH 26/54] iommu/amd: Consolidate protection domain free code Consolidate protection domain free code inside amd_iommu_domain_free() and remove protection_domain_free() function. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-8-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 1 - drivers/iommu/amd/iommu.c | 15 +++++---------- drivers/iommu/amd/pasid.c | 2 +- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index e3bf27da1339e..220c598b7e148 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -47,7 +47,6 @@ extern unsigned long amd_iommu_pgsize_bitmap; /* Protection domain ops */ void amd_iommu_init_identity_domain(void); struct protection_domain *protection_domain_alloc(void); -void protection_domain_free(struct protection_domain *domain); struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct mm_struct *mm); void amd_iommu_domain_free(struct iommu_domain *dom); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 797c8612966a9..534585ea09396 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2434,15 +2434,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) * *****************************************************************************/ -void protection_domain_free(struct protection_domain *domain) -{ - WARN_ON(!list_empty(&domain->dev_list)); - if (domain->domain.type & __IOMMU_DOMAIN_PAGING) - free_io_pgtable_ops(&domain->iop.pgtbl.ops); - pdom_id_free(domain->id); - kfree(domain); -} - static void protection_domain_init(struct protection_domain *domain) { spin_lock_init(&domain->lock); @@ -2580,7 +2571,11 @@ void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain = to_pdomain(dom); - protection_domain_free(domain); + WARN_ON(!list_empty(&domain->dev_list)); + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) + free_io_pgtable_ops(&domain->iop.pgtbl.ops); + pdom_id_free(domain->id); + kfree(domain); } static int blocked_domain_attach_device(struct iommu_domain *domain, diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index 11150cfd67182..77c8e9a91cbca 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -195,7 +195,7 @@ struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, ret = mmu_notifier_register(&pdom->mn, mm); if (ret) { - protection_domain_free(pdom); + amd_iommu_domain_free(&pdom->domain); return ERR_PTR(ret); } From 9d7b779e3004308d0ee23fc37da22aef011afb92 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Fri, 7 Mar 2025 21:07:45 +0100 Subject: [PATCH 27/54] iommu: apple-dart: Increase MAX_DARTS_PER_DEVICE to 3 ISP needs this. Signed-off-by: Hector Martin Reviewed-by: Alyssa Rosenzweig Signed-off-by: Sasha Finkelstein Reviewed-by: Sven Peter Link: https://lore.kernel.org/r/20250307-isp-dart-v3-1-684fe4489591@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/apple-dart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 95ba3caeb4017..edb2fb2233350 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -36,7 +36,7 @@ #define DART_MAX_STREAMS 256 #define DART_MAX_TTBR 4 -#define MAX_DARTS_PER_DEVICE 2 +#define MAX_DARTS_PER_DEVICE 3 /* Common registers */ From 3bc0102835f666d934addedbd37ff7f2f26c865d Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Fri, 7 Mar 2025 21:07:46 +0100 Subject: [PATCH 28/54] iommu: apple-dart: Allow mismatched bypass support This is needed by ISP, which has DART0 with bypass and DART1/2 without. Signed-off-by: Hector Martin Reviewed-by: Alyssa Rosenzweig Signed-off-by: Sasha Finkelstein Reviewed-by: Sven Peter Link: https://lore.kernel.org/r/20250307-isp-dart-v3-2-684fe4489591@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/apple-dart.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index edb2fb2233350..13ccb801f52a4 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -277,6 +277,9 @@ struct apple_dart_domain { * @streams: streams for this device */ struct apple_dart_master_cfg { + /* Intersection of DART capabilitles */ + u32 supports_bypass : 1; + struct apple_dart_stream_map stream_maps[MAX_DARTS_PER_DEVICE]; }; @@ -684,7 +687,7 @@ static int apple_dart_attach_dev_identity(struct iommu_domain *domain, struct apple_dart_stream_map *stream_map; int i; - if (!cfg->stream_maps[0].dart->supports_bypass) + if (!cfg->supports_bypass) return -EINVAL; for_each_stream_map(i, cfg, stream_map) @@ -792,20 +795,24 @@ static int apple_dart_of_xlate(struct device *dev, return -EINVAL; sid = args->args[0]; - if (!cfg) + if (!cfg) { cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + + /* Will be ANDed with DART capabilities */ + cfg->supports_bypass = true; + } if (!cfg) return -ENOMEM; dev_iommu_priv_set(dev, cfg); cfg_dart = cfg->stream_maps[0].dart; if (cfg_dart) { - if (cfg_dart->supports_bypass != dart->supports_bypass) - return -EINVAL; if (cfg_dart->pgsize != dart->pgsize) return -EINVAL; } + cfg->supports_bypass &= dart->supports_bypass; + for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) { if (cfg->stream_maps[i].dart == dart) { set_bit(sid, cfg->stream_maps[i].sidmap); @@ -945,7 +952,7 @@ static int apple_dart_def_domain_type(struct device *dev) if (cfg->stream_maps[0].dart->pgsize > PAGE_SIZE) return IOMMU_DOMAIN_IDENTITY; - if (!cfg->stream_maps[0].dart->supports_bypass) + if (!cfg->supports_bypass) return IOMMU_DOMAIN_DMA; return 0; From 9ce7603ad3cbae64003087de57834e8c5c856a20 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Mon, 10 Mar 2025 10:47:44 +0800 Subject: [PATCH 29/54] iommu/vt-d: Fix system hang on reboot -f We found that executing the command ./a.out &;reboot -f (where a.out is a program that only executes a while(1) infinite loop) can probabilistically cause the system to hang in the intel_iommu_shutdown() function, rendering it unresponsive. Through analysis, we identified that the factors contributing to this issue are as follows: 1. The reboot -f command does not prompt the kernel to notify the application layer to perform cleanup actions, allowing the application to continue running. 2. When the kernel reaches the intel_iommu_shutdown() function, only the BSP (Bootstrap Processor) CPU is operational in the system. 3. During the execution of intel_iommu_shutdown(), the function down_write (&dmar_global_lock) causes the process to sleep and be scheduled out. 4. At this point, though the processor's interrupt flag is not cleared, allowing interrupts to be accepted. However, only legacy devices and NMI (Non-Maskable Interrupt) interrupts could come in, as other interrupts routing have already been disabled. If no legacy or NMI interrupts occur at this stage, the scheduler will not be able to run. 5. If the application got scheduled at this time is executing a while(1)- type loop, it will be unable to be preempted, leading to an infinite loop and causing the system to become unresponsive. To resolve this issue, the intel_iommu_shutdown() function should not execute down_write(), which can potentially cause the process to be scheduled out. Furthermore, since only the BSP is running during the later stages of the reboot, there is no need for protection against parallel access to the DMAR (DMA Remapping) unit. Therefore, the following lines could be removed: down_write(&dmar_global_lock); up_write(&dmar_global_lock); After testing, the issue has been resolved. Fixes: 6c3a44ed3c55 ("iommu/vt-d: Turn off translations at shutdown") Co-developed-by: Ethan Zhao Signed-off-by: Ethan Zhao Signed-off-by: Yunhui Cui Link: https://lore.kernel.org/r/20250303062421.17929-1-cuiyunhui@bytedance.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index cc46098f875b1..47c74b20d342c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2871,16 +2871,19 @@ void intel_iommu_shutdown(void) if (no_iommu || dmar_disabled) return; - down_write(&dmar_global_lock); + /* + * All other CPUs were brought down, hotplug interrupts were disabled, + * no lock and RCU checking needed anymore + */ + list_for_each_entry(drhd, &dmar_drhd_units, list) { + iommu = drhd->iommu; - /* Disable PMRs explicitly here. */ - for_each_iommu(iommu, drhd) + /* Disable PMRs explicitly here. */ iommu_disable_protect_mem_regions(iommu); - /* Make sure the IOMMUs are switched off */ - intel_disable_iommus(); - - up_write(&dmar_global_lock); + /* Make sure the IOMMUs are switched off */ + iommu_disable_translation(iommu); + } } static struct intel_iommu *dev_to_intel_iommu(struct device *dev) From a8653e5cc2047b6a3354dc072a9f1ebaff157eaa Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Mar 2025 10:47:45 +0800 Subject: [PATCH 30/54] iommu/vt-d: Use virt_to_phys() If all the inlines are unwound virt_to_dma_pfn() is simply: return page_to_pfn(virt_to_page(p)) << (PAGE_SHIFT - VTD_PAGE_SHIFT); Which can be re-arranged to: (page_to_pfn(virt_to_page(p)) << PAGE_SHIFT) >> VTD_PAGE_SHIFT The only caller is: ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) re-arranged to: ((page_to_pfn(virt_to_page(tmp_page)) << PAGE_SHIFT) >> VTD_PAGE_SHIFT) << VTD_PAGE_SHIFT Which simplifies to: page_to_pfn(virt_to_page(tmp_page)) << PAGE_SHIFT That is the same as virt_to_phys(tmp_page), so just remove all of this. Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v3-e797f4dc6918+93057-iommu_pages_jgg@nvidia.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 3 ++- drivers/iommu/intel/iommu.h | 19 ------------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 47c74b20d342c..0050f9cc37ac8 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -737,7 +737,8 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, return NULL; domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); - pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; + pteval = virt_to_phys(tmp_page) | DMA_PTE_READ | + DMA_PTE_WRITE; if (domain->use_first_level) pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 6ea7bbe26b19d..dd980808998da 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -953,25 +953,6 @@ static inline unsigned long lvl_to_nr_pages(unsigned int lvl) return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); } -/* VT-d pages must always be _smaller_ than MM pages. Otherwise things - are never going to work. */ -static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) -{ - return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); -} -static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) -{ - return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; -} -static inline unsigned long page_to_dma_pfn(struct page *pg) -{ - return mm_to_dma_pfn_start(page_to_pfn(pg)); -} -static inline unsigned long virt_to_dma_pfn(void *p) -{ - return page_to_dma_pfn(virt_to_page(p)); -} - static inline void context_set_present(struct context_entry *context) { context->lo |= 1; From 607ba1bb096110751f6aa4b46666e0ba024ab3c2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Mar 2025 10:47:46 +0800 Subject: [PATCH 31/54] iommu/vt-d: Check if SVA is supported when attaching the SVA domain Attach of a SVA domain should fail if SVA is not supported, move the check for SVA support out of IOMMU_DEV_FEAT_SVA and into attach. Also check when allocating a SVA domain to match other drivers. Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250228092631.3425464-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 37 +------------------------------ drivers/iommu/intel/svm.c | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0050f9cc37ac8..743212b8d9a6c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3855,41 +3855,6 @@ static struct iommu_group *intel_iommu_device_group(struct device *dev) return generic_device_group(dev); } -static int intel_iommu_enable_sva(struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; - - if (!info || dmar_disabled) - return -EINVAL; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) - return -ENODEV; - - if (!info->pasid_enabled || !info->ats_enabled) - return -EINVAL; - - /* - * Devices having device-specific I/O fault handling should not - * support PCI/PRI. The IOMMU side has no means to check the - * capability of device-specific IOPF. Therefore, IOMMU can only - * default that if the device driver enables SVA on a non-PRI - * device, it will handle IOPF in its own way. - */ - if (!info->pri_supported) - return 0; - - /* Devices supporting PRI should have it enabled. */ - if (!info->pri_enabled) - return -EINVAL; - - return 0; -} - static int context_flip_pri(struct device_domain_info *info, bool enable) { struct intel_iommu *iommu = info->iommu; @@ -4010,7 +3975,7 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) return intel_iommu_enable_iopf(dev); case IOMMU_DEV_FEAT_SVA: - return intel_iommu_enable_sva(dev); + return 0; default: return -ENODEV; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index f5569347591f2..ba93123cb4eba 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -110,6 +110,41 @@ static const struct mmu_notifier_ops intel_mmuops = { .free_notifier = intel_mm_free_notifier, }; +static int intel_iommu_sva_supported(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu; + + if (!info || dmar_disabled) + return -EINVAL; + + iommu = info->iommu; + if (!iommu) + return -EINVAL; + + if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) + return -ENODEV; + + if (!info->pasid_enabled || !info->ats_enabled) + return -EINVAL; + + /* + * Devices having device-specific I/O fault handling should not + * support PCI/PRI. The IOMMU side has no means to check the + * capability of device-specific IOPF. Therefore, IOMMU can only + * default that if the device driver enables SVA on a non-PRI + * device, it will handle IOPF in its own way. + */ + if (!info->pri_supported) + return 0; + + /* Devices supporting PRI should have it enabled. */ + if (!info->pri_enabled) + return -EINVAL; + + return 0; +} + static int intel_svm_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) @@ -121,6 +156,10 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, unsigned long sflags; int ret = 0; + ret = intel_iommu_sva_supported(dev); + if (ret) + return ret; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); if (IS_ERR(dev_pasid)) return PTR_ERR(dev_pasid); @@ -161,6 +200,10 @@ struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct dmar_domain *domain; int ret; + ret = intel_iommu_sva_supported(dev); + if (ret) + return ERR_PTR(ret); + domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return ERR_PTR(-ENOMEM); From 5518f239aff1baf772c5748da3add7243c5fb5df Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Mar 2025 10:47:47 +0800 Subject: [PATCH 32/54] iommu/vt-d: Move scalable mode ATS enablement to probe path Device ATS is currently enabled when a domain is attached to the device and disabled when the domain is detached. This creates a limitation: when the IOMMU is operating in scalable mode and IOPF is enabled, the device's domain cannot be changed. The previous code enables ATS when a domain is set to a device's RID and disables it during RID domain switch. So, if a PASID is set with a domain requiring PRI, ATS should remain enabled until the domain is removed. During the PASID domain's lifecycle, if the RID's domain changes, PRI will be disrupted because it depends on ATS, which is disabled when the blocking domain is set for the device's RID. Remove this limitation by moving ATS enablement to the device probe path. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250228092631.3425464-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 51 ++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 743212b8d9a6c..83928038b9cb1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1173,32 +1173,28 @@ static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) return true; } -static void iommu_enable_pci_caps(struct device_domain_info *info) +static void iommu_enable_pci_ats(struct device_domain_info *info) { struct pci_dev *pdev; - if (!dev_is_pci(info->dev)) + if (!info->ats_supported) return; pdev = to_pci_dev(info->dev); - if (info->ats_supported && pci_ats_page_aligned(pdev) && - !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) + if (!pci_ats_page_aligned(pdev)) + return; + + if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) info->ats_enabled = 1; } -static void iommu_disable_pci_caps(struct device_domain_info *info) +static void iommu_disable_pci_ats(struct device_domain_info *info) { - struct pci_dev *pdev; - - if (!dev_is_pci(info->dev)) + if (!info->ats_enabled) return; - pdev = to_pci_dev(info->dev); - - if (info->ats_enabled) { - pci_disable_ats(pdev); - info->ats_enabled = 0; - } + pci_disable_ats(to_pci_dev(info->dev)); + info->ats_enabled = 0; } static void intel_flush_iotlb_all(struct iommu_domain *domain) @@ -1557,12 +1553,19 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; + int ret; if (!dev_is_pci(dev)) return domain_context_mapping_one(domain, iommu, bus, devfn); - return pci_for_each_dma_alias(to_pci_dev(dev), - domain_context_mapping_cb, domain); + ret = pci_for_each_dma_alias(to_pci_dev(dev), + domain_context_mapping_cb, domain); + if (ret) + return ret; + + iommu_enable_pci_ats(info); + + return 0; } /* Return largest possible superpage level for a given mapping */ @@ -1844,8 +1847,6 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (ret) goto out_block_translation; - iommu_enable_pci_caps(info); - ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); if (ret) goto out_block_translation; @@ -3202,6 +3203,7 @@ static void domain_context_clear(struct device_domain_info *info) pci_for_each_dma_alias(to_pci_dev(info->dev), &domain_context_clear_one_cb, info); + iommu_disable_pci_ats(info); } /* @@ -3218,7 +3220,6 @@ void device_block_translation(struct device *dev) if (info->domain) cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); - iommu_disable_pci_caps(info); if (!dev_is_real_dma_subdevice(dev)) { if (sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, dev, @@ -3753,6 +3754,9 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) !pci_enable_pasid(pdev, info->pasid_supported & ~1)) info->pasid_enabled = 1; + if (sm_supported(iommu)) + iommu_enable_pci_ats(info); + return &iommu->iommu; free_table: intel_pasid_free_table(dev); @@ -3769,6 +3773,8 @@ static void intel_iommu_release_device(struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + iommu_disable_pci_ats(info); + if (info->pasid_enabled) { pci_disable_pasid(to_pci_dev(dev)); info->pasid_enabled = 0; @@ -4375,13 +4381,10 @@ static int identity_domain_attach_dev(struct iommu_domain *domain, struct device if (dev_is_real_dma_subdevice(dev)) return 0; - if (sm_supported(iommu)) { + if (sm_supported(iommu)) ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); - if (!ret) - iommu_enable_pci_caps(info); - } else { + else ret = device_setup_pass_through(dev); - } return ret; } From 87caaba1d174d365a9717e01d2c42335ba1da7be Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Mar 2025 10:47:48 +0800 Subject: [PATCH 33/54] iommu/vt-d: Move PRI enablement in probe path Update PRI enablement to use the new method, similar to the amd iommu driver. Enable PRI in the device probe path and disable it when the device is released. PRI is enabled throughout the device's iommu lifecycle. The infrastructure for the iommu subsystem to handle iopf requests is created during iopf enablement and released during iopf disablement. All invalid page requests from the device are automatically handled by the iommu subsystem if iopf is not enabled. Add iopf_refcount to track the iopf enablement. Convert the return type of intel_iommu_disable_iopf() to void, as there is no way to handle a failure when disabling this feature. Make intel_iommu_enable/disable_iopf() helpers global, as they will be used beyond the current file in the subsequent patch. The iopf_refcount is not protected by any lock. This is acceptable, as there is no concurrent access to it in the current code. The following patch will address this by moving it to the domain attach/detach paths, which are protected by the iommu group mutex. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250228092631.3425464-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 137 +++++++++++++----------------------- drivers/iommu/intel/iommu.h | 4 ++ drivers/iommu/intel/pasid.c | 2 + drivers/iommu/intel/prq.c | 2 +- 4 files changed, 55 insertions(+), 90 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 83928038b9cb1..9ed8bdb3e9da6 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1197,6 +1197,37 @@ static void iommu_disable_pci_ats(struct device_domain_info *info) info->ats_enabled = 0; } +static void iommu_enable_pci_pri(struct device_domain_info *info) +{ + struct pci_dev *pdev; + + if (!info->ats_enabled || !info->pri_supported) + return; + + pdev = to_pci_dev(info->dev); + /* PASID is required in PRG Response Message. */ + if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) + return; + + if (pci_reset_pri(pdev)) + return; + + if (!pci_enable_pri(pdev, PRQ_DEPTH)) + info->pri_enabled = 1; +} + +static void iommu_disable_pci_pri(struct device_domain_info *info) +{ + if (!info->pri_enabled) + return; + + if (WARN_ON(info->iopf_refcount)) + iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); + + pci_disable_pri(to_pci_dev(info->dev)); + info->pri_enabled = 0; +} + static void intel_flush_iotlb_all(struct iommu_domain *domain) { cache_tag_flush_all(to_dmar_domain(domain)); @@ -3756,6 +3787,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) if (sm_supported(iommu)) iommu_enable_pci_ats(info); + iommu_enable_pci_pri(info); return &iommu->iommu; free_table: @@ -3773,6 +3805,7 @@ static void intel_iommu_release_device(struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + iommu_disable_pci_pri(info); iommu_disable_pci_ats(info); if (info->pasid_enabled) { @@ -3861,116 +3894,41 @@ static struct iommu_group *intel_iommu_device_group(struct device *dev) return generic_device_group(dev); } -static int context_flip_pri(struct device_domain_info *info, bool enable) -{ - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; - struct context_entry *context; - u16 did; - - spin_lock(&iommu->lock); - if (context_copied(iommu, bus, devfn)) { - spin_unlock(&iommu->lock); - return -EINVAL; - } - - context = iommu_context_addr(iommu, bus, devfn, false); - if (!context || !context_present(context)) { - spin_unlock(&iommu->lock); - return -ENODEV; - } - did = context_domain_id(context); - - if (enable) - context_set_sm_pre(context); - else - context_clear_sm_pre(context); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(context, sizeof(*context)); - intel_context_flush_present(info, context, did, true); - spin_unlock(&iommu->lock); - - return 0; -} - -static int intel_iommu_enable_iopf(struct device *dev) +int intel_iommu_enable_iopf(struct device *dev) { - struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; + struct intel_iommu *iommu = info->iommu; int ret; - if (!pdev || !info || !info->ats_enabled || !info->pri_supported) + if (!info->pri_enabled) return -ENODEV; - if (info->pri_enabled) - return -EBUSY; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - /* PASID is required in PRG Response Message. */ - if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) - return -EINVAL; - - ret = pci_reset_pri(pdev); - if (ret) - return ret; + if (info->iopf_refcount) { + info->iopf_refcount++; + return 0; + } ret = iopf_queue_add_device(iommu->iopf_queue, dev); if (ret) return ret; - ret = context_flip_pri(info, true); - if (ret) - goto err_remove_device; - - ret = pci_enable_pri(pdev, PRQ_DEPTH); - if (ret) - goto err_clear_pri; - - info->pri_enabled = 1; + info->iopf_refcount = 1; return 0; -err_clear_pri: - context_flip_pri(info, false); -err_remove_device: - iopf_queue_remove_device(iommu->iopf_queue, dev); - - return ret; } -static int intel_iommu_disable_iopf(struct device *dev) +void intel_iommu_disable_iopf(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; - if (!info->pri_enabled) - return -EINVAL; + if (WARN_ON(!info->pri_enabled || !info->iopf_refcount)) + return; - /* Disable new PRI reception: */ - context_flip_pri(info, false); + if (--info->iopf_refcount) + return; - /* - * Remove device from fault queue and acknowledge all outstanding - * PRQs to the device: - */ iopf_queue_remove_device(iommu->iopf_queue, dev); - - /* - * PCIe spec states that by clearing PRI enable bit, the Page - * Request Interface will not issue new page requests, but has - * outstanding page requests that have been transmitted or are - * queued for transmission. This is supposed to be called after - * the device driver has stopped DMA, all PASIDs have been - * unbound and the outstanding PRQs have been drained. - */ - pci_disable_pri(to_pci_dev(dev)); - info->pri_enabled = 0; - - return 0; } static int @@ -3993,7 +3951,8 @@ intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) { switch (feat) { case IOMMU_DEV_FEAT_IOPF: - return intel_iommu_disable_iopf(dev); + intel_iommu_disable_iopf(dev); + return 0; case IOMMU_DEV_FEAT_SVA: return 0; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index dd980808998da..42b4e500989b2 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -774,6 +774,7 @@ struct device_domain_info { u8 ats_enabled:1; u8 dtlb_extra_inval:1; /* Quirk for devices need extra flush */ u8 ats_qdep; + unsigned int iopf_refcount; struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ @@ -1295,6 +1296,9 @@ void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid); +int intel_iommu_enable_iopf(struct device *dev); +void intel_iommu_disable_iopf(struct device *dev); + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index fb59a7d35958f..c2742e256552a 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -992,6 +992,8 @@ static int context_entry_set_pasid_table(struct context_entry *context, context_set_sm_dte(context); if (info->pasid_supported) context_set_pasid(context); + if (info->pri_supported) + context_set_sm_pre(context); context_set_fault_enable(context); context_set_present(context); diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index 064194399b38b..5b6a64d968502 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -67,7 +67,7 @@ void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) u16 sid, did; info = dev_iommu_priv_get(dev); - if (!info->pri_enabled) + if (!info->iopf_refcount) return; iommu = info->iommu; From 4c293add5874038dc82ef579663dd86744d8e872 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Mar 2025 10:47:49 +0800 Subject: [PATCH 34/54] iommu/vt-d: Cleanup intel_context_flush_present() The intel_context_flush_present() is called in places where either the scalable mode is disabled, or scalable mode is enabled but all PASID entries are known to be non-present. In these cases, the flush_domains path within intel_context_flush_present() will never execute. This dead code is therefore removed. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250228092631.3425464-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/iommu.h | 5 ++--- drivers/iommu/intel/pasid.c | 41 +++++++------------------------------ 3 files changed, 10 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9ed8bdb3e9da6..072223fe6fe76 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1783,7 +1783,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, did, true); + intel_context_flush_no_pasid(info, context, did); } int __domain_setup_first_level(struct intel_iommu *iommu, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 42b4e500989b2..c4916886da5a0 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1286,9 +1286,8 @@ void cache_tag_flush_all(struct dmar_domain *domain); void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, unsigned long end); -void intel_context_flush_present(struct device_domain_info *info, - struct context_entry *context, - u16 did, bool affect_domains); +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did); int intel_iommu_enable_prq(struct intel_iommu *iommu); int intel_iommu_finish_prq(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index c2742e256552a..7ee18bb48bd46 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -932,7 +932,7 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, did, false); + intel_context_flush_no_pasid(info, context, did); } static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) @@ -1119,17 +1119,15 @@ static void __context_flush_dev_iotlb(struct device_domain_info *info) /* * Cache invalidations after change in a context table entry that was present - * according to the Spec 6.5.3.3 (Guidance to Software for Invalidations). If - * IOMMU is in scalable mode and all PASID table entries of the device were - * non-present, set flush_domains to false. Otherwise, true. + * according to the Spec 6.5.3.3 (Guidance to Software for Invalidations). + * This helper can only be used when IOMMU is working in the legacy mode or + * IOMMU is in scalable mode but all PASID table entries of the device are + * non-present. */ -void intel_context_flush_present(struct device_domain_info *info, - struct context_entry *context, - u16 did, bool flush_domains) +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did) { struct intel_iommu *iommu = info->iommu; - struct pasid_entry *pte; - int i; /* * Device-selective context-cache invalidation. The Domain-ID field @@ -1152,30 +1150,5 @@ void intel_context_flush_present(struct device_domain_info *info, return; } - /* - * For scalable mode: - * - Domain-selective PASID-cache invalidation to affected domains - * - Domain-selective IOTLB invalidation to affected domains - * - Global Device-TLB invalidation to affected functions - */ - if (flush_domains) { - /* - * If the IOMMU is running in scalable mode and there might - * be potential PASID translations, the caller should hold - * the lock to ensure that context changes and cache flushes - * are atomic. - */ - assert_spin_locked(&iommu->lock); - for (i = 0; i < info->pasid_table->max_pasid; i++) { - pte = intel_pasid_get_entry(info->dev, i); - if (!pte || !pasid_pte_is_present(pte)) - continue; - - did = pasid_get_domain_id(pte); - qi_flush_pasid_cache(iommu, did, QI_PC_ALL_PASIDS, 0); - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - } - } - __context_flush_dev_iotlb(info); } From 29c6e1c2b923b43e8082bba5c6675185a8fe305a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 27 Feb 2025 14:47:47 +0000 Subject: [PATCH 35/54] iommu: Unexport iommu_fwspec_free() The drivers doing their own fwspec parsing have no need to call iommu_fwspec_free() since fwspecs were moved into dev_iommu, as returning an error from .probe_device will tear down the whole lot anyway. Move it into the private interface now that it only serves for of_iommu to clean up in an error case. I have no idea what mtk_v1 was doing in effectively guaranteeing a NULL fwspec would be dereferenced if no "iommus" DT property was found, so add a check for that to at least make the code look sane. Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/36e245489361de2d13db22a510fa5c79e7126278.1740667667.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 1 - drivers/iommu/iommu-priv.h | 2 ++ drivers/iommu/iommu.c | 1 - drivers/iommu/mtk_iommu_v1.c | 14 ++++---------- drivers/iommu/tegra-smmu.c | 1 - include/linux/iommu.h | 5 ----- 6 files changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index de205a34ffc6d..ea9f8e484e354 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1486,7 +1486,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) out_cfg_free: kfree(cfg); out_free: - iommu_fwspec_free(dev); return ERR_PTR(ret); } diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index b4508423e13ba..fed55fbfe99cb 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -24,6 +24,8 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ? fwspec->iommu_fwnode : NULL); } +void iommu_fwspec_free(struct device *dev); + int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d96cfb3af8a3f..ac4731d13a83e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2837,7 +2837,6 @@ void iommu_fwspec_free(struct device *dev) dev_iommu_fwspec_set(dev, NULL); } } -EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index a565b9e40f4a6..3e724e7f10f02 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -446,22 +446,13 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct iommu_fwspec *fwspec = NULL; struct of_phandle_args iommu_spec; struct mtk_iommu_v1_data *data; int err, idx = 0, larbid, larbidx; struct device_link *link; struct device *larbdev; - /* - * In the deferred case, free the existed fwspec. - * Always initialize the fwspec internally. - */ - if (fwspec) { - iommu_fwspec_free(dev); - fwspec = dev_iommu_fwspec_get(dev); - } - while (!of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", idx, &iommu_spec)) { @@ -476,6 +467,9 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) idx++; } + if (!fwspec) + return ERR_PTR(-ENODEV); + data = dev_iommu_priv_get(dev); /* Link the consumer device with the smi-larb device(supplier) */ diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 7f633bb5efef1..69d353e1df843 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -846,7 +846,6 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev, err = ops->of_xlate(dev, args); if (err < 0) { dev_err(dev, "failed to parse SW group ID: %d\n", err); - iommu_fwspec_free(dev); return err; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e93d2e918599f..cf8c16ba04a08 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1099,7 +1099,6 @@ struct iommu_mm_data { }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); -void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) @@ -1410,10 +1409,6 @@ static inline int iommu_fwspec_init(struct device *dev, return -ENODEV; } -static inline void iommu_fwspec_free(struct device *dev) -{ -} - static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { From b46064a18810bad3aea089a79993ca5ea7a3d2b2 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:30 +0000 Subject: [PATCH 36/54] iommu: Handle race with default domain setup It turns out that deferred default domain creation leaves a subtle race window during iommu_device_register() wherein a client driver may asynchronously probe in parallel and get as far as performing DMA API operations with dma-direct, only to be switched to iommu-dma underfoot once the default domain attachment finally happens, with obviously disastrous consequences. Even the wonky of_iommu_configure() path is at risk, since iommu_fwspec_init() will no longer defer client probe as the instance ops are (necessarily) already registered, and the "replay" iommu_probe_device() call can see dev->iommu_group already set and so think there's nothing to do either. Fortunately we already have the right tool in the right place in the form of iommu_device_use_default_domain(), which just needs to ensure that said default domain is actually ready to *be* used. Deferring the client probe shouldn't have too much impact, given that this only happens while the IOMMU driver is probing, and thus due to kick the deferred probe list again once it finishes. Reported-by: Charan Teja Kalla Fixes: 98ac73f99bc4 ("iommu: Require a default_domain for all iommu drivers") Reviewed-by: Jason Gunthorpe Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e88b94c9b575034a2c98a48b3d383654cbda7902.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ac4731d13a83e..5b73d36e43591 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3084,6 +3084,11 @@ int iommu_device_use_default_domain(struct device *dev) return 0; mutex_lock(&group->mutex); + /* We may race against bus_iommu_probe() finalising groups here */ + if (!group->default_domain) { + ret = -EPROBE_DEFER; + goto unlock_out; + } if (group->owner_cnt) { if (group->domain != group->default_domain || group->owner || !xa_empty(&group->pasid_array)) { From fd598f71b66975c042f40ae7cd4310d6e699e149 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:31 +0000 Subject: [PATCH 37/54] iommu: Resolve ops in iommu_init_device() Since iommu_init_device() was factored out, it is in fact the only consumer of the ops which __iommu_probe_device() is resolving, so let it do that itself rather than passing them in. This also puts the ops lookup at a more logical point relative to the rest of the flow through __iommu_probe_device(). Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/fa4b6cfc67a352488b7f4e0b736008307ce9ac2e.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5b73d36e43591..d26a459f29c32 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -407,14 +407,28 @@ EXPORT_SYMBOL_GPL(dev_iommu_priv_set); * Init the dev->iommu and dev->iommu_group in the struct device and get the * driver probed */ -static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) +static int iommu_init_device(struct device *dev) { + const struct iommu_ops *ops; struct iommu_device *iommu_dev; struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; + /* + * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU + * instances with non-NULL fwnodes, and client devices should have been + * identified with a fwspec by this point. Otherwise, we can currently + * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can + * be present, and that any of their registered instances has suitable + * ops for probing, and thus cheekily co-opt the same mechanism. + */ + ops = iommu_fwspec_ops(dev->iommu->fwspec); + if (!ops) { + ret = -ENODEV; + goto err_free; + } if (!try_module_get(ops->owner)) { ret = -EINVAL; @@ -517,22 +531,10 @@ DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { - const struct iommu_ops *ops; struct iommu_group *group; struct group_device *gdev; int ret; - /* - * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU - * instances with non-NULL fwnodes, and client devices should have been - * identified with a fwspec by this point. Otherwise, we can currently - * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can - * be present, and that any of their registered instances has suitable - * ops for probing, and thus cheekily co-opt the same mechanism. - */ - ops = iommu_fwspec_ops(dev_iommu_fwspec_get(dev)); - if (!ops) - return -ENODEV; /* * Serialise to avoid races between IOMMU drivers registering in * parallel and/or the "replay" calls from ACPI/OF code via client @@ -546,7 +548,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (dev->iommu_group) return 0; - ret = iommu_init_device(dev, ops); + ret = iommu_init_device(dev); if (ret) return ret; From 3832862eb9c4dfa0e80b2522bfaedbc8a43de97d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:32 +0000 Subject: [PATCH 38/54] iommu: Keep dev->iommu state consistent At the moment, if of_iommu_configure() allocates dev->iommu itself via iommu_fwspec_init(), then suffers a DT parsing failure, it cleans up the fwspec but leaves the empty dev_iommu hanging around. So far this is benign (if a tiny bit wasteful), but we'd like to be able to reason about dev->iommu having a consistent and unambiguous lifecycle. Thus make sure that the of_iommu cleanup undoes precisely whatever it did. Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/d219663a3f23001f23d520a883ac622d70b4e642.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-priv.h | 2 ++ drivers/iommu/iommu.c | 2 +- drivers/iommu/of_iommu.c | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index fed55fbfe99cb..05fa6e682e88d 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -17,6 +17,8 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } +void dev_iommu_free(struct device *dev); + const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwspec) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d26a459f29c32..b0be5c168a435 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -355,7 +355,7 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) return param; } -static void dev_iommu_free(struct device *dev) +void dev_iommu_free(struct device *dev) { struct dev_iommu *param = dev->iommu; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 97987cd78da93..e10a68b5ffde1 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -116,6 +116,7 @@ static void of_pci_check_device_ats(struct device *dev, struct device_node *np) int of_iommu_configure(struct device *dev, struct device_node *master_np, const u32 *id) { + bool dev_iommu_present; int err; if (!master_np) @@ -127,6 +128,7 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, mutex_unlock(&iommu_probe_device_lock); return 0; } + dev_iommu_present = dev->iommu; /* * We don't currently walk up the tree looking for a parent IOMMU. @@ -147,8 +149,10 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, err = of_iommu_configure_device(master_np, dev, id); } - if (err) + if (err && dev_iommu_present) iommu_fwspec_free(dev); + else if (err && dev->iommu) + dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); if (!err && dev->bus) From bcb81ac6ae3c2ef95b44e7b54c3c9522364a245c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:33 +0000 Subject: [PATCH 39/54] iommu: Get DT/ACPI parsing into the proper probe path In hindsight, there were some crucial subtleties overlooked when moving {of,acpi}_dma_configure() to driver probe time to allow waiting for IOMMU drivers with -EPROBE_DEFER, and these have become an ever-increasing source of problems. The IOMMU API has some fundamental assumptions that iommu_probe_device() is called for every device added to the system, in the order in which they are added. Calling it in a random order or not at all dependent on driver binding leads to malformed groups, a potential lack of isolation for devices with no driver, and all manner of unexpected concurrency and race conditions. We've attempted to mitigate the latter with point-fix bodges like iommu_probe_device_lock, but it's a losing battle and the time has come to bite the bullet and address the true source of the problem instead. The crux of the matter is that the firmware parsing actually serves two distinct purposes; one is identifying the IOMMU instance associated with a device so we can check its availability, the second is actually telling that instance about the relevant firmware-provided data for the device. However the latter also depends on the former, and at the time there was no good place to defer and retry that separately from the availability check we also wanted for client driver probe. Nowadays, though, we have a proper notion of multiple IOMMU instances in the core API itself, and each one gets a chance to probe its own devices upon registration, so we can finally make that work as intended for DT/IORT/VIOT platforms too. All we need is for iommu_probe_device() to be able to run the iommu_fwspec machinery currently buried deep in the wrong end of {of,acpi}_dma_configure(). Luckily it turns out to be surprisingly straightforward to bootstrap this transformation by pretty much just calling the same path twice. At client driver probe time, dev->driver is obviously set; conversely at device_add(), or a subsequent bus_iommu_probe(), any device waiting for an IOMMU really should *not* have a driver already, so we can use that as a condition to disambiguate the two cases, and avoid recursing back into the IOMMU core at the wrong times. Obviously this isn't the nicest thing, but for now it gives us a functional baseline to then unpick the layers in between without many more awkward cross-subsystem patches. There are some minor side-effects like dma_range_map potentially being created earlier, and some debug prints being repeated, but these aren't significantly detrimental. Let's make things work first, then deal with making them nice. With the basic flow finally in the right order again, the next step is probably turning the bus->dma_configure paths inside-out, since all we really need from bus code is its notion of which device and input ID(s) to parse the common firmware properties with... Acked-by: Bjorn Helgaas # pci-driver.c Acked-by: Rob Herring (Arm) # of/device.c Signed-off-by: Robin Murphy Reviewed-by: Lorenzo Pieralisi Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/e3b191e6fd6ca9a1e84c5e5e40044faf97abb874.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/dma.c | 5 +++++ drivers/acpi/scan.c | 7 ------- drivers/amba/bus.c | 3 ++- drivers/base/platform.c | 3 ++- drivers/bus/fsl-mc/fsl-mc-bus.c | 3 ++- drivers/cdx/cdx.c | 3 ++- drivers/iommu/iommu.c | 24 +++++++++++++++++++++--- drivers/iommu/of_iommu.c | 7 ++++++- drivers/of/device.c | 7 ++++++- drivers/pci/pci-driver.c | 3 ++- 10 files changed, 48 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c index 52b2abf886898..f30f138352b7b 100644 --- a/drivers/acpi/arm64/dma.c +++ b/drivers/acpi/arm64/dma.c @@ -26,6 +26,11 @@ void acpi_arch_dma_setup(struct device *dev) else end = (1ULL << 32) - 1; + if (dev->dma_range_map) { + dev_dbg(dev, "dma_range_map already set\n"); + return; + } + ret = acpi_dma_get_range(dev, &map); if (!ret && map) { end = dma_range_map_max(map); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 9f4efa8f75a61..fb1fe9f3b1a36 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1632,13 +1632,6 @@ static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) err = viot_iommu_configure(dev); mutex_unlock(&iommu_probe_device_lock); - /* - * If we have reason to believe the IOMMU driver missed the initial - * iommu_probe_device() call for dev, replay it to get things in order. - */ - if (!err && dev->bus) - err = iommu_probe_device(dev); - return err; } diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 8ef259b4d0378..71482d639a6da 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -364,7 +364,8 @@ static int amba_dma_configure(struct device *dev) ret = acpi_dma_configure(dev, attr); } - if (!ret && !drv->driver_managed_dma) { + /* @drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 6f2a33722c520..1813cfd0c4bdf 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1451,7 +1451,8 @@ static int platform_dma_configure(struct device *dev) attr = acpi_get_dma_attr(to_acpi_device_node(fwnode)); ret = acpi_dma_configure(dev, attr); } - if (ret || drv->driver_managed_dma) + /* @drv may not be valid when we're called from the IOMMU layer */ + if (ret || !dev->driver || drv->driver_managed_dma) return ret; ret = iommu_device_use_default_domain(dev); diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index d1f3d327ddd15..a8be8cf246fb6 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -153,7 +153,8 @@ static int fsl_mc_dma_configure(struct device *dev) else ret = acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id); - if (!ret && !mc_drv->driver_managed_dma) { + /* @mc_drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !mc_drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index c573ed2ee71a8..780fb0c4adbae 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -360,7 +360,8 @@ static int cdx_dma_configure(struct device *dev) return ret; } - if (!ret && !cdx_drv->driver_managed_dma) { + /* @cdx_drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !cdx_drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b0be5c168a435..c283721579b33 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -417,9 +417,21 @@ static int iommu_init_device(struct device *dev) if (!dev_iommu_get(dev)) return -ENOMEM; /* - * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU - * instances with non-NULL fwnodes, and client devices should have been - * identified with a fwspec by this point. Otherwise, we can currently + * For FDT-based systems and ACPI IORT/VIOT, the common firmware parsing + * is buried in the bus dma_configure path. Properly unpicking that is + * still a big job, so for now just invoke the whole thing. The device + * already having a driver bound means dma_configure has already run and + * either found no IOMMU to wait for, or we're in its replay call right + * now, so either way there's no point calling it again. + */ + if (!dev->driver && dev->bus->dma_configure) { + mutex_unlock(&iommu_probe_device_lock); + dev->bus->dma_configure(dev); + mutex_lock(&iommu_probe_device_lock); + } + /* + * At this point, relevant devices either now have a fwspec which will + * match ops registered with a non-NULL fwnode, or we can reasonably * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can * be present, and that any of their registered instances has suitable * ops for probing, and thus cheekily co-opt the same mechanism. @@ -429,6 +441,12 @@ static int iommu_init_device(struct device *dev) ret = -ENODEV; goto err_free; } + /* + * And if we do now see any replay calls, they would indicate someone + * misusing the dma_configure path outside bus code. + */ + if (dev->driver) + dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); if (!try_module_get(ops->owner)) { ret = -EINVAL; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index e10a68b5ffde1..6b989a62def20 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -155,7 +155,12 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); - if (!err && dev->bus) + /* + * If we're not on the iommu_probe_device() path (as indicated by the + * initial dev->iommu) then try to simulate it. This should no longer + * happen unless of_dma_configure() is being misused outside bus code. + */ + if (!err && dev->bus && !dev_iommu_present) err = iommu_probe_device(dev); if (err && err != -EPROBE_DEFER) diff --git a/drivers/of/device.c b/drivers/of/device.c index edf3be1972658..5053e5d532cc3 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -99,6 +99,11 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, bool coherent, set_map = false; int ret; + if (dev->dma_range_map) { + dev_dbg(dev, "dma_range_map already set\n"); + goto skip_map; + } + if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else @@ -119,7 +124,7 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, end = dma_range_map_max(map); set_map = true; } - +skip_map: /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. For diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index f57ea36d125d6..4468b7327cabe 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1653,7 +1653,8 @@ static int pci_dma_configure(struct device *dev) pci_put_host_bridge_device(bridge); - if (!ret && !driver->driver_managed_dma) { + /* @driver may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !driver->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); From df90abbc31e69863dd98c5498a50811e10cdb4b3 Mon Sep 17 00:00:00 2001 From: Pratyush Brahma Date: Mon, 10 Mar 2025 16:48:34 +0530 Subject: [PATCH 40/54] dt-bindings: arm-smmu: Document QCS8300 GPU SMMU Add the compatible for Qualcomm QCS8300 GPU SMMU. Add the compatible in the list of clocks required by the GPU SMMU and remove it from the list of disallowed clocks. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Pratyush Brahma Link: https://lore.kernel.org/r/20250310-b4-branch-gfx-smmu-v6-1-15c60b8abd99@quicinc.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/iommu/arm,smmu.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml index 032fdc27127bf..7b9d5507d6ccd 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml @@ -90,6 +90,7 @@ properties: - enum: - qcom,qcm2290-smmu-500 - qcom,qcs615-smmu-500 + - qcom,qcs8300-smmu-500 - qcom,sa8255p-smmu-500 - qcom,sa8775p-smmu-500 - qcom,sar2130p-smmu-500 @@ -397,6 +398,7 @@ allOf: compatible: contains: enum: + - qcom,qcs8300-smmu-500 - qcom,sa8775p-smmu-500 - qcom,sc7280-smmu-500 - qcom,sc8280xp-smmu-500 @@ -581,7 +583,6 @@ allOf: - cavium,smmu-v2 - marvell,ap806-smmu-500 - nvidia,smmu-500 - - qcom,qcs8300-smmu-500 - qcom,qdu1000-smmu-500 - qcom,sa8255p-smmu-500 - qcom,sc7180-smmu-500 From 0a679336dc171db06d84679ae9180e5cda42166c Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Thu, 23 Jan 2025 19:56:36 +0000 Subject: [PATCH 41/54] iommu/arm-smmu: Set rpm auto_suspend once during probe The current code calls arm_smmu_rpm_use_autosuspend() during device attach, which seems unusual as it sets the autosuspend delay and the 'use_autosuspend' flag for the smmu device. These parameters can be simply set once during the smmu probe and in order to avoid bouncing rpm states, we can simply mark_last_busy() during a client dev attach as discussed in [1]. Move the handling of arm_smmu_rpm_use_autosuspend() to the SMMU probe and modify the arm_smmu_rpm_put() function to mark_last_busy() before calling __pm_runtime_put_autosuspend(). Additionally, s/pm_runtime_put_autosuspend/__pm_runtime_put_autosuspend/ to help with the refactor of the pm_runtime_put_autosuspend() API [2]. Link: https://lore.kernel.org/r/20241023164835.GF29251@willie-the-truck [1] Link: https://git.kernel.org/linus/b7d46644e554 [2] Signed-off-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20250123195636.4182099-1-praan@google.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index de205a34ffc6d..746187051e03f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -79,8 +79,11 @@ static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu) static inline void arm_smmu_rpm_put(struct arm_smmu_device *smmu) { - if (pm_runtime_enabled(smmu->dev)) - pm_runtime_put_autosuspend(smmu->dev); + if (pm_runtime_enabled(smmu->dev)) { + pm_runtime_mark_last_busy(smmu->dev); + __pm_runtime_put_autosuspend(smmu->dev); + + } } static void arm_smmu_rpm_use_autosuspend(struct arm_smmu_device *smmu) @@ -1195,7 +1198,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) /* Looks ok, so add the device to the domain */ arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_TRANS, smmu_domain->cfg.cbndx, fwspec); - arm_smmu_rpm_use_autosuspend(smmu); rpm_put: arm_smmu_rpm_put(smmu); return ret; @@ -1218,7 +1220,6 @@ static int arm_smmu_attach_dev_type(struct device *dev, return ret; arm_smmu_master_install_s2crs(cfg, type, 0, fwspec); - arm_smmu_rpm_use_autosuspend(smmu); arm_smmu_rpm_put(smmu); return 0; } @@ -2246,6 +2247,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (dev->pm_domain) { pm_runtime_set_active(dev); pm_runtime_enable(dev); + arm_smmu_rpm_use_autosuspend(smmu); } return 0; From 73d2f10957f517e5918c81b7e859e214ba71c044 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 12 Mar 2025 15:01:31 +0000 Subject: [PATCH 42/54] iommu: Don't warn prematurely about dodgy probes The warning for suspect probe conditions inadvertently got moved too early in a prior respin - it happened to work out OK for fwspecs, but in general still needs to be after the ops->probe_device call so drivers which filter devices for themselves have a chance to do that. Signed-off-by: Robin Murphy Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path") Link: https://lore.kernel.org/r/72a4853e7ef36e7c1c4ca171ed4ed8e1a463a61a.1741791691.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c283721579b33..9e1b444246f8b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -441,12 +441,6 @@ static int iommu_init_device(struct device *dev) ret = -ENODEV; goto err_free; } - /* - * And if we do now see any replay calls, they would indicate someone - * misusing the dma_configure path outside bus code. - */ - if (dev->driver) - dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); if (!try_module_get(ops->owner)) { ret = -EINVAL; @@ -569,6 +563,12 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list ret = iommu_init_device(dev); if (ret) return ret; + /* + * And if we do now see any replay calls, they would indicate someone + * misusing the dma_configure path outside bus code. + */ + if (dev->driver) + dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); group = dev->iommu_group; gdev = iommu_group_alloc_device(group, dev); From 1c608b0b280d8a33edee28f4fb531151b47ec33c Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Fri, 7 Mar 2025 15:28:19 +0530 Subject: [PATCH 43/54] iommu/amd: Introduce generic function to set multibit feature value Define generic function `iommu_feature_set()` to set the values in the feature control register and replace `iommu_set_inv_tlb_timeout()` with it. Signed-off-by: Sairaj Kodilkar Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250307095822.2274-2-sarunkod@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/init.c | 27 ++++++++++----------------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 3f57bdd0a6598..fe0672676551e 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -181,7 +181,7 @@ #define CONTROL_IRTCACHEDIS 59 #define CONTROL_SNPAVIC_EN 61 -#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT) +#define CTRL_INV_TO_MASK 7 #define CTRL_INV_TO_NONE 0 #define CTRL_INV_TO_1MS 1 #define CTRL_INV_TO_10MS 2 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 94aaa4b2a9031..2428d5c6c729b 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -411,33 +411,26 @@ static void iommu_set_device_table(struct amd_iommu *iommu) &entry, sizeof(entry)); } -/* Generic functions to enable/disable certain features of the IOMMU. */ -void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_set(struct amd_iommu *iommu, u64 val, u64 mask, u8 shift) { u64 ctrl; ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl |= (1ULL << bit); + mask <<= shift; + ctrl &= ~mask; + ctrl |= (val << shift) & mask; writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +/* Generic functions to enable/disable certain features of the IOMMU. */ +void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { - u64 ctrl; - - ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl &= ~(1ULL << bit); - writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); + iommu_feature_set(iommu, 1ULL, 1ULL, bit); } -static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout) +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { - u64 ctrl; - - ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl &= ~CTRL_INV_TO_MASK; - ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK; - writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); + iommu_feature_set(iommu, 0ULL, 1ULL, bit); } /* Function to enable the hardware */ @@ -2651,7 +2644,7 @@ static void iommu_init_flags(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_COHERENT_EN); /* Set IOTLB invalidation timeout to 1s */ - iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); + iommu_feature_set(iommu, CTRL_INV_TO_1S, CTRL_INV_TO_MASK, CONTROL_INV_TIMEOUT); /* Enable Enhanced Peripheral Page Request Handling */ if (check_feature(FEATURE_EPHSUP)) From eaf717fa1c3f01ed7762f0733787b07d1ab6c970 Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Fri, 7 Mar 2025 15:28:20 +0530 Subject: [PATCH 44/54] iommu/amd: Replace slab cache allocator with page allocator Commit 05152a049444 ("iommu/amd: Add slab-cache for irq remapping tables") introduces slab cache allocator. But slab cache allocator provides benefit only when the allocation and deallocation of many identical objects is frequent. The AMD IOMMU driver allocates Interrupt remapping table (IRT) when device driver requests IRQ for the first time and never frees it. Hence the slab allocator does not provide any benefit here. Signed-off-by: Sairaj Kodilkar Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250307095822.2274-3-sarunkod@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 4 ---- drivers/iommu/amd/init.c | 21 +-------------------- drivers/iommu/amd/iommu.c | 26 ++++++++++++++------------ 3 files changed, 15 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index fe0672676551e..487fbe97aeaf0 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -313,7 +313,6 @@ * AMD IOMMU hardware only support 512 IRTEs despite * the architectural limitation of 2048 entries. */ -#define DTE_INTTAB_ALIGNMENT 128 #define DTE_INTTABLEN_VALUE 9ULL #define DTE_INTTABLEN (DTE_INTTABLEN_VALUE << 1) #define DTE_INTTABLEN_MASK (0xfULL << 1) @@ -492,9 +491,6 @@ extern const struct iommu_ops amd_iommu_ops; /* IVRS indicates that pre-boot remapping was enabled */ extern bool amdr_ivrs_remap_support; -/* kmem_cache to get tables with 128 byte alignement */ -extern struct kmem_cache *amd_iommu_irq_cache; - #define PCI_SBDF_TO_SEGID(sbdf) (((sbdf) >> 16) & 0xffff) #define PCI_SBDF_TO_DEVID(sbdf) ((sbdf) & 0xffff) #define PCI_SEG_DEVID_TO_SBDF(seg, devid) ((((u32)(seg) & 0xffff) << 16) | \ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 2428d5c6c729b..1f305371c4c31 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -2931,9 +2930,6 @@ static struct syscore_ops amd_iommu_syscore_ops = { static void __init free_iommu_resources(void) { - kmem_cache_destroy(amd_iommu_irq_cache); - amd_iommu_irq_cache = NULL; - free_iommu_all(); free_pci_segments(); } @@ -3032,7 +3028,7 @@ static void __init ivinfo_init(void *ivrs) static int __init early_amd_iommu_init(void) { struct acpi_table_header *ivrs_base; - int remap_cache_sz, ret; + int ret; acpi_status status; if (!amd_iommu_detected) @@ -3094,22 +3090,7 @@ static int __init early_amd_iommu_init(void) if (amd_iommu_irq_remap) { struct amd_iommu_pci_seg *pci_seg; - /* - * Interrupt remapping enabled, create kmem_cache for the - * remapping tables. - */ ret = -ENOMEM; - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) - remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32); - else - remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2); - amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache", - remap_cache_sz, - DTE_INTTAB_ALIGNMENT, - 0, NULL); - if (!amd_iommu_irq_cache) - goto out; - for_each_pci_segment(pci_seg) { if (alloc_irq_lookup_table(pci_seg)) goto out; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 534585ea09396..3123b8fe7f8b6 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -75,8 +75,6 @@ struct iommu_cmd { */ DEFINE_IDA(pdom_ids); -struct kmem_cache *amd_iommu_irq_cache; - static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev); @@ -3118,7 +3116,7 @@ static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) return table; } -static struct irq_remap_table *__alloc_irq_table(void) +static struct irq_remap_table *__alloc_irq_table(int nid, int order) { struct irq_remap_table *table; @@ -3126,19 +3124,13 @@ static struct irq_remap_table *__alloc_irq_table(void) if (!table) return NULL; - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL); + table->table = iommu_alloc_pages_node(nid, GFP_KERNEL, order); if (!table->table) { kfree(table); return NULL; } raw_spin_lock_init(&table->lock); - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) - memset(table->table, 0, - MAX_IRQS_PER_TABLE * sizeof(u32)); - else - memset(table->table, 0, - (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); return table; } @@ -3170,6 +3162,14 @@ static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias, return 0; } +static inline size_t get_irq_table_size(unsigned int max_irqs) +{ + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + return max_irqs * sizeof(u32); + + return max_irqs * (sizeof(u64) * 2); +} + static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, u16 devid, struct pci_dev *pdev) { @@ -3177,6 +3177,8 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, struct irq_remap_table *new_table = NULL; struct amd_iommu_pci_seg *pci_seg; unsigned long flags; + int order = get_order(get_irq_table_size(MAX_IRQS_PER_TABLE)); + int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; u16 alias; spin_lock_irqsave(&iommu_table_lock, flags); @@ -3195,7 +3197,7 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, spin_unlock_irqrestore(&iommu_table_lock, flags); /* Nothing there yet, allocate new irq remapping table */ - new_table = __alloc_irq_table(); + new_table = __alloc_irq_table(nid, order); if (!new_table) return NULL; @@ -3230,7 +3232,7 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, spin_unlock_irqrestore(&iommu_table_lock, flags); if (new_table) { - kmem_cache_free(amd_iommu_irq_cache, new_table->table); + iommu_free_pages(new_table->table, order); kfree(new_table); } return table; From 950865c1b88abf190eaec5fa703e5fef05093e30 Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Fri, 7 Mar 2025 15:28:21 +0530 Subject: [PATCH 45/54] iommu/amd: Rename DTE_INTTABLEN* and MAX_IRQS_PER_TABLE macro AMD iommu can support both 512 and 2K interrupts on newer platform. Hence add suffix "512" to the existing macros. Signed-off-by: Sairaj Kodilkar Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250307095822.2274-4-sarunkod@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 6 +++--- drivers/iommu/amd/init.c | 2 +- drivers/iommu/amd/iommu.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 487fbe97aeaf0..433eca602a4ee 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -313,10 +313,10 @@ * AMD IOMMU hardware only support 512 IRTEs despite * the architectural limitation of 2048 entries. */ -#define DTE_INTTABLEN_VALUE 9ULL -#define DTE_INTTABLEN (DTE_INTTABLEN_VALUE << 1) #define DTE_INTTABLEN_MASK (0xfULL << 1) -#define MAX_IRQS_PER_TABLE (1 << DTE_INTTABLEN_VALUE) +#define DTE_INTTABLEN_VALUE_512 9ULL +#define DTE_INTTABLEN_512 (DTE_INTTABLEN_VALUE_512 << 1) +#define MAX_IRQS_PER_TABLE_512 BIT(DTE_INTTABLEN_VALUE_512) #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 1f305371c4c31..d91c04253fd3c 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1060,7 +1060,7 @@ static bool __copy_device_table(struct amd_iommu *iommu) int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK; if (irq_v && (int_ctl || int_tab_len)) { if ((int_ctl != DTE_IRQ_REMAP_INTCTL) || - (int_tab_len != DTE_INTTABLEN)) { + (int_tab_len != DTE_INTTABLEN_512)) { pr_err("Wrong old irq remapping flag: %#x\n", devid); memunmap(old_devtb); return false; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 3123b8fe7f8b6..d4559d555fa0a 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3090,7 +3090,7 @@ static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, new &= ~DTE_IRQ_PHYS_ADDR_MASK; new |= iommu_virt_to_phys(table->table); new |= DTE_IRQ_REMAP_INTCTL; - new |= DTE_INTTABLEN; + new |= DTE_INTTABLEN_512; new |= DTE_IRQ_REMAP_ENABLE; WRITE_ONCE(dte->data[2], new); From 19e5cc156cbc32421eb8237615f246c08eea8284 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 7 Mar 2025 15:28:22 +0530 Subject: [PATCH 46/54] iommu/amd: Enable support for up to 2K interrupts per function AMD IOMMU optionally supports up to 2K interrupts per function on newer platforms. Support for this feature is indicated through Extended Feature 2 Register (MMIO Offset 01A0h[NumIntRemapSup]). Allocate 2K IRTEs per device when this support is available. Co-developed-by: Sairaj Kodilkar Signed-off-by: Sairaj Kodilkar Signed-off-by: Kishon Vijay Abraham I Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250307095822.2274-5-sarunkod@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 15 ++++++--- drivers/iommu/amd/init.c | 16 +++++++++- drivers/iommu/amd/iommu.c | 48 +++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 433eca602a4ee..5089b58e528a7 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -112,6 +112,10 @@ #define FEATURE_SNPAVICSUP_GAM(x) \ (FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1) +#define FEATURE_NUM_INT_REMAP_SUP GENMASK_ULL(9, 8) +#define FEATURE_NUM_INT_REMAP_SUP_2K(x) \ + (FIELD_GET(FEATURE_NUM_INT_REMAP_SUP, x) == 0x1) + /* Note: * The current driver only support 16-bit PASID. * Currently, hardware only implement upto 16-bit PASID @@ -175,6 +179,9 @@ #define CONTROL_GAM_EN 25 #define CONTROL_GALOG_EN 28 #define CONTROL_GAINT_EN 29 +#define CONTROL_NUM_INT_REMAP_MODE 43 +#define CONTROL_NUM_INT_REMAP_MODE_MASK 0x03 +#define CONTROL_NUM_INT_REMAP_MODE_2K 0x01 #define CONTROL_EPH_EN 45 #define CONTROL_XT_EN 50 #define CONTROL_INTCAPXT_EN 51 @@ -309,14 +316,13 @@ #define DTE_IRQ_REMAP_INTCTL (2ULL << 60) #define DTE_IRQ_REMAP_ENABLE 1ULL -/* - * AMD IOMMU hardware only support 512 IRTEs despite - * the architectural limitation of 2048 entries. - */ #define DTE_INTTABLEN_MASK (0xfULL << 1) #define DTE_INTTABLEN_VALUE_512 9ULL #define DTE_INTTABLEN_512 (DTE_INTTABLEN_VALUE_512 << 1) #define MAX_IRQS_PER_TABLE_512 BIT(DTE_INTTABLEN_VALUE_512) +#define DTE_INTTABLEN_VALUE_2K 11ULL +#define DTE_INTTABLEN_2K (DTE_INTTABLEN_VALUE_2K << 1) +#define MAX_IRQS_PER_TABLE_2K BIT(DTE_INTTABLEN_VALUE_2K) #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 @@ -847,6 +853,7 @@ struct iommu_dev_data { struct device *dev; u16 devid; /* PCI Device ID */ + unsigned int max_irqs; /* Maximum IRQs supported by device */ u32 max_pasids; /* Max supported PASIDs */ u32 flags; /* Holds AMD_IOMMU_DEVICE_FLAG_<*> */ int ats_qdep; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index d91c04253fd3c..dd9e26b7b7184 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1060,7 +1060,8 @@ static bool __copy_device_table(struct amd_iommu *iommu) int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK; if (irq_v && (int_ctl || int_tab_len)) { if ((int_ctl != DTE_IRQ_REMAP_INTCTL) || - (int_tab_len != DTE_INTTABLEN_512)) { + (int_tab_len != DTE_INTTABLEN_512 && + int_tab_len != DTE_INTTABLEN_2K)) { pr_err("Wrong old irq remapping flag: %#x\n", devid); memunmap(old_devtb); return false; @@ -2736,6 +2737,17 @@ static void iommu_enable_irtcachedis(struct amd_iommu *iommu) iommu->irtcachedis_enabled ? "disabled" : "enabled"); } +static void iommu_enable_2k_int(struct amd_iommu *iommu) +{ + if (!FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + return; + + iommu_feature_set(iommu, + CONTROL_NUM_INT_REMAP_MODE_2K, + CONTROL_NUM_INT_REMAP_MODE_MASK, + CONTROL_NUM_INT_REMAP_MODE); +} + static void early_enable_iommu(struct amd_iommu *iommu) { iommu_disable(iommu); @@ -2748,6 +2760,7 @@ static void early_enable_iommu(struct amd_iommu *iommu) iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); + iommu_enable_2k_int(iommu); iommu_enable(iommu); amd_iommu_flush_all_caches(iommu); } @@ -2804,6 +2817,7 @@ static void early_enable_iommus(void) iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); + iommu_enable_2k_int(iommu); iommu_set_device_table(iommu); amd_iommu_flush_all_caches(iommu); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d4559d555fa0a..02d016b04d3f9 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2394,8 +2394,14 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) } out_err: + iommu_completion_wait(iommu); + if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K; + else + dev_data->max_irqs = MAX_IRQS_PER_TABLE_512; + if (dev_is_pci(dev)) pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT); @@ -3076,6 +3082,13 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) raw_spin_unlock_irqrestore(&iommu->lock, flags); } +static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) +{ + if (dev_data && dev_data->max_irqs == MAX_IRQS_PER_TABLE_2K) + return DTE_INTTABLEN_2K; + return DTE_INTTABLEN_512; +} + static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, struct irq_remap_table *table) { @@ -3090,7 +3103,7 @@ static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, new &= ~DTE_IRQ_PHYS_ADDR_MASK; new |= iommu_virt_to_phys(table->table); new |= DTE_IRQ_REMAP_INTCTL; - new |= DTE_INTTABLEN_512; + new |= iommu_get_int_tablen(dev_data); new |= DTE_IRQ_REMAP_ENABLE; WRITE_ONCE(dte->data[2], new); @@ -3171,13 +3184,14 @@ static inline size_t get_irq_table_size(unsigned int max_irqs) } static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, - u16 devid, struct pci_dev *pdev) + u16 devid, struct pci_dev *pdev, + unsigned int max_irqs) { struct irq_remap_table *table = NULL; struct irq_remap_table *new_table = NULL; struct amd_iommu_pci_seg *pci_seg; unsigned long flags; - int order = get_order(get_irq_table_size(MAX_IRQS_PER_TABLE)); + int order = get_order(get_irq_table_size(max_irqs)); int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; u16 alias; @@ -3239,13 +3253,14 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, } static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, - bool align, struct pci_dev *pdev) + bool align, struct pci_dev *pdev, + unsigned long max_irqs) { struct irq_remap_table *table; int index, c, alignment = 1; unsigned long flags; - table = alloc_irq_table(iommu, devid, pdev); + table = alloc_irq_table(iommu, devid, pdev, max_irqs); if (!table) return -ENODEV; @@ -3256,7 +3271,7 @@ static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, /* Scan table for free entries */ for (index = ALIGN(table->min_index, alignment), c = 0; - index < MAX_IRQS_PER_TABLE;) { + index < max_irqs;) { if (!iommu->irte_ops->is_allocated(table, index)) { c += 1; } else { @@ -3526,6 +3541,14 @@ static void fill_msi_msg(struct msi_msg *msg, u32 index) msg->data = index; msg->address_lo = 0; msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + /* + * The struct msi_msg.dest_mode_logical is used to set the DM bit + * in MSI Message Address Register. For device w/ 2K int-remap support, + * this is bit must be set to 1 regardless of the actual destination + * mode, which is signified by the IRTE[DM]. + */ + if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + msg->arch_addr_lo.dest_mode_logical = true; msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; } @@ -3588,6 +3611,8 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, struct amd_ir_data *data = NULL; struct amd_iommu *iommu; struct irq_cfg *cfg; + struct iommu_dev_data *dev_data; + unsigned long max_irqs; int i, ret, devid, seg, sbdf; int index; @@ -3606,6 +3631,9 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (!iommu) return -EINVAL; + dev_data = search_dev_data(iommu, devid); + max_irqs = dev_data ? dev_data->max_irqs : MAX_IRQS_PER_TABLE_512; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); if (ret < 0) return ret; @@ -3613,7 +3641,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { struct irq_remap_table *table; - table = alloc_irq_table(iommu, devid, NULL); + table = alloc_irq_table(iommu, devid, NULL, max_irqs); if (table) { if (!table->min_index) { /* @@ -3634,9 +3662,11 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); index = alloc_irq_index(iommu, devid, nr_irqs, align, - msi_desc_to_pci_dev(info->desc)); + msi_desc_to_pci_dev(info->desc), + max_irqs); } else { - index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL); + index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL, + max_irqs); } if (index < 0) { From ba40f9dc95b27df499ee683d94baa82bfe73b69e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 11 Mar 2025 14:15:56 +0000 Subject: [PATCH 47/54] iommu/mediatek-v1: Support COMPILE_TEST Add COMPILE_TEST support to Mediatek v1 so I have less chance of breaking it again in future. This just needs the ARM dma-iommu API stubbing out for now. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/a8d7c0b4393b360ce556f5f15e229d1e2fe73c83.1741702556.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 3 +-- drivers/iommu/mtk_iommu_v1.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 5124e7431fe31..cd750f512deee 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -482,8 +482,7 @@ config MTK_IOMMU config MTK_IOMMU_V1 tristate "MediaTek IOMMU Version 1 (M4U gen1) Support" - depends on ARM - depends on ARCH_MEDIATEK || COMPILE_TEST + depends on (ARCH_MEDIATEK && ARM) || COMPILE_TEST select ARM_DMA_USE_IOMMU select IOMMU_API select MEMORY diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 3e724e7f10f02..66824982e05fb 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -27,11 +27,20 @@ #include #include #include -#include #include #include #include +#if defined(CONFIG_ARM) +#include +#else +#define arm_iommu_create_mapping(...) NULL +#define arm_iommu_attach_device(...) -ENODEV +struct dma_iommu_mapping { + struct iommu_domain *domain; +}; +#endif + #define REG_MMU_PT_BASE_ADDR 0x000 #define F_ALL_INVLD 0x2 From f48dcda8f6a48d4592cc74f944d65d6a09f17895 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 13 Mar 2025 17:31:04 +0000 Subject: [PATCH 48/54] iommu/rockchip: Allocate per-device data sensibly Now that DT-based probing is finally happening in the right order again, it reveals an issue in Rockchip's of_xlate, which can now be called during registration, but is using the global dma_dev which is only assigned later. However, this makes little sense when we're already looking up the correct IOMMU device, who should logically be the owner of the devm allocation anyway. Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Tested-by: Anders Roxell Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path") Signed-off-by: Robin Murphy Tested-by: Quentin Schulz Tested-by: Dang Huynh Reviewed-by: Nicolas Frattaroli Tested-by: Nicolas Frattaroli Link: https://lore.kernel.org/r/771e91cf16b3048e93f657153b76905665878fa2.1741886382.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/rockchip-iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 323cc665c3570..48826d1ccfd85 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -1148,12 +1148,12 @@ static int rk_iommu_of_xlate(struct device *dev, struct platform_device *iommu_dev; struct rk_iommudata *data; - data = devm_kzalloc(dma_dev, sizeof(*data), GFP_KERNEL); + iommu_dev = of_find_device_by_node(args->np); + + data = devm_kzalloc(&iommu_dev->dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; - iommu_dev = of_find_device_by_node(args->np); - data->iommu = platform_get_drvdata(iommu_dev); data->iommu->domain = &rk_identity_domain; dev_iommu_priv_set(dev, data); From f90aa59eb2999e4c33049f7e5679a6424da5c6b9 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 13 Mar 2025 17:31:05 +0000 Subject: [PATCH 49/54] iommu/rockchip: Register in a sensible order Currently Rockchip calls iommu_device_register() before it's finished setting up the hardware and driver state, and as such it now gets unhappy in various ways when registration starts working the way it was always intended to, and probing client devices straight away. Reorder the operations to ensure that what we're registering is a prepared and functional IOMMU instance. Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path") Signed-off-by: Robin Murphy Tested-by: Quentin Schulz Tested-by: Dang Huynh Reviewed-by: Nicolas Frattaroli Tested-by: Nicolas Frattaroli Link: https://lore.kernel.org/r/e69532f00bf49d98322b96788edb7e2e305e4006.1741886382.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/rockchip-iommu.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 48826d1ccfd85..9d2120a9232ac 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -1256,14 +1256,6 @@ static int rk_iommu_probe(struct platform_device *pdev) if (err) return err; - err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev)); - if (err) - goto err_unprepare_clocks; - - err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev); - if (err) - goto err_remove_sysfs; - /* * Use the first registered IOMMU device for domain to use with DMA * API, since a domain might not physically correspond to a single @@ -1290,12 +1282,19 @@ static int rk_iommu_probe(struct platform_device *pdev) dma_set_mask_and_coherent(dev, rk_ops->dma_bit_mask); + err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev)); + if (err) + goto err_pm_disable; + + err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev); + if (err) + goto err_remove_sysfs; + return 0; -err_pm_disable: - pm_runtime_disable(dev); err_remove_sysfs: iommu_device_sysfs_remove(&iommu->iommu); -err_unprepare_clocks: +err_pm_disable: + pm_runtime_disable(dev); clk_bulk_unprepare(iommu->num_clocks, iommu->clocks); return err; } From dcde1c4aa7ceec94e8557191d5789fd76df2f671 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 13 Mar 2025 17:31:06 +0000 Subject: [PATCH 50/54] iommu/rockchip: Retire global dma_dev workaround The global dma_dev trick was mostly because the old domain_alloc op provided no context, so no way to know which IOMMU was to own the pagetable, or if a suitable one even existed at all. In the new multi-instance world with domain_alloc_paging this is no longer a concern - now we know that the given device must be associated with a valid IOMMU instance which provided the op to call in the first place, and therefore that instance can and should be the pagetable owner. To avoid worrying about the lifetime and stability of the rk_domain->iommus list, and keep the lookups simple and efficient, we'll still stash a dma_dev pointer, but now it's accurately per-domain. Signed-off-by: Robin Murphy Tested-by: Quentin Schulz Tested-by: Dang Huynh Reviewed-by: Nicolas Frattaroli Tested-by: Nicolas Frattaroli Link: https://lore.kernel.org/r/25dc948a7d35c8142c5719ac22bc523f8524d006.1741886382.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/rockchip-iommu.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 9d2120a9232ac..af4cc91b2bbfc 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -88,6 +88,7 @@ struct rk_iommu_domain { dma_addr_t dt_dma; spinlock_t iommus_lock; /* lock for iommus list */ spinlock_t dt_lock; /* lock for modifying page directory table */ + struct device *dma_dev; struct iommu_domain domain; }; @@ -123,7 +124,6 @@ struct rk_iommudata { struct rk_iommu *iommu; }; -static struct device *dma_dev; static const struct rk_iommu_ops *rk_ops; static struct iommu_domain rk_identity_domain; @@ -132,7 +132,7 @@ static inline void rk_table_flush(struct rk_iommu_domain *dom, dma_addr_t dma, { size_t size = count * sizeof(u32); /* count of u32 entry */ - dma_sync_single_for_device(dma_dev, dma, size, DMA_TO_DEVICE); + dma_sync_single_for_device(dom->dma_dev, dma, size, DMA_TO_DEVICE); } static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom) @@ -734,9 +734,9 @@ static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain, if (!page_table) return ERR_PTR(-ENOMEM); - pt_dma = dma_map_single(dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE); - if (dma_mapping_error(dma_dev, pt_dma)) { - dev_err(dma_dev, "DMA mapping error while allocating page table\n"); + pt_dma = dma_map_single(rk_domain->dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE); + if (dma_mapping_error(rk_domain->dma_dev, pt_dma)) { + dev_err(rk_domain->dma_dev, "DMA mapping error while allocating page table\n"); iommu_free_page(page_table); return ERR_PTR(-ENOMEM); } @@ -1051,9 +1051,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev) { struct rk_iommu_domain *rk_domain; - - if (!dma_dev) - return NULL; + struct rk_iommu *iommu; rk_domain = kzalloc(sizeof(*rk_domain), GFP_KERNEL); if (!rk_domain) @@ -1068,10 +1066,12 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev) if (!rk_domain->dt) goto err_free_domain; - rk_domain->dt_dma = dma_map_single(dma_dev, rk_domain->dt, + iommu = rk_iommu_from_dev(dev); + rk_domain->dma_dev = iommu->dev; + rk_domain->dt_dma = dma_map_single(rk_domain->dma_dev, rk_domain->dt, SPAGE_SIZE, DMA_TO_DEVICE); - if (dma_mapping_error(dma_dev, rk_domain->dt_dma)) { - dev_err(dma_dev, "DMA map error for DT\n"); + if (dma_mapping_error(rk_domain->dma_dev, rk_domain->dt_dma)) { + dev_err(rk_domain->dma_dev, "DMA map error for DT\n"); goto err_free_dt; } @@ -1105,13 +1105,13 @@ static void rk_iommu_domain_free(struct iommu_domain *domain) if (rk_dte_is_pt_valid(dte)) { phys_addr_t pt_phys = rk_ops->pt_address(dte); u32 *page_table = phys_to_virt(pt_phys); - dma_unmap_single(dma_dev, pt_phys, + dma_unmap_single(rk_domain->dma_dev, pt_phys, SPAGE_SIZE, DMA_TO_DEVICE); iommu_free_page(page_table); } } - dma_unmap_single(dma_dev, rk_domain->dt_dma, + dma_unmap_single(rk_domain->dma_dev, rk_domain->dt_dma, SPAGE_SIZE, DMA_TO_DEVICE); iommu_free_page(rk_domain->dt); @@ -1256,14 +1256,6 @@ static int rk_iommu_probe(struct platform_device *pdev) if (err) return err; - /* - * Use the first registered IOMMU device for domain to use with DMA - * API, since a domain might not physically correspond to a single - * IOMMU device.. - */ - if (!dma_dev) - dma_dev = &pdev->dev; - pm_runtime_enable(dev); for (i = 0; i < iommu->num_irq; i++) { From b8741496c058c6d65d09799081158d1593554638 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Fri, 14 Mar 2025 23:01:02 +0000 Subject: [PATCH 51/54] iommu: apple-dart: fix potential null pointer deref If kzalloc() fails, accessing cfg->supports_bypass causes a null pointer dereference. Fix by checking for NULL immediately after allocation and returning -ENOMEM. Fixes: 3bc0102835f6 ("iommu: apple-dart: Allow mismatched bypass support") Signed-off-by: Qasim Ijaz Reviewed-by: Alyssa Rosenzweig Link: https://lore.kernel.org/r/20250314230102.11008-1-qasdev00@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/apple-dart.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 13ccb801f52a4..e13501541fdd9 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -797,12 +797,11 @@ static int apple_dart_of_xlate(struct device *dev, if (!cfg) { cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); - + if (!cfg) + return -ENOMEM; /* Will be ANDed with DART capabilities */ cfg->supports_bypass = true; } - if (!cfg) - return -ENOMEM; dev_iommu_priv_set(dev, cfg); cfg_dart = cfg->stream_maps[0].dart; From 2454823e97a63d85a6b215905f71e5a06324eab7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Mar 2025 10:20:59 +0800 Subject: [PATCH 52/54] iommu/vt-d: Put IRTE back into posted MSI mode if vCPU posting is disabled Add a helper to take care of reconfiguring an IRTE to deliver IRQs to the host, i.e. not to a vCPU, and use the helper when an IRTE's vCPU affinity is nullified, i.e. when KVM puts an IRTE back into "host" mode. Because posted MSIs use an ephemeral IRTE, using modify_irte() puts the IRTE into full remapped mode, i.e. unintentionally disables posted MSIs on the IRQ. Fixes: ed1e48ea4370 ("iommu/vt-d: Enable posted mode for device MSIs") Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Jacob Pan Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20250315025135.2365846-2-seanjc@google.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/irq_remapping.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index ad795c772f21b..c495b533103f3 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1169,7 +1169,17 @@ static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {} #endif -static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) +static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + + if (ir_data->irq_2_iommu.posted_msi) + intel_ir_reconfigure_irte_posted(irqd); + else if (force_host || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) + modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); +} + +static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) { struct intel_ir_data *ir_data = irqd->chip_data; struct irte *irte = &ir_data->irte_entry; @@ -1182,10 +1192,7 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) irte->vector = cfg->vector; irte->dest_id = IRTE_DEST(cfg->dest_apicid); - if (ir_data->irq_2_iommu.posted_msi) - intel_ir_reconfigure_irte_posted(irqd); - else if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) - modify_irte(&ir_data->irq_2_iommu, irte); + __intel_ir_reconfigure_irte(irqd, force_host); } /* @@ -1240,7 +1247,7 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) /* stop posting interrupts, back to the default mode */ if (!vcpu_pi_info) { - modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); + __intel_ir_reconfigure_irte(data, true); } else { struct irte irte_pi; From 688124cc541f60d26a7547f45637b23dada4e527 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Mar 2025 10:21:00 +0800 Subject: [PATCH 53/54] iommu/vt-d: Don't clobber posted vCPU IRTE when host IRQ affinity changes Don't overwrite an IRTE that is posting IRQs to a vCPU with a posted MSI entry if the host IRQ affinity happens to change. If/when the IRTE is reverted back to "host mode", it will be reconfigured as a posted MSI or remapped entry as appropriate. Drop the "mode" field, which doesn't differentiate between posted MSIs and posted vCPUs, in favor of a dedicated posted_vcpu flag. Note! The two posted_{msi,vcpu} flags are intentionally not mutually exclusive; an IRTE can transition between posted MSI and posted vCPU. Fixes: ed1e48ea4370 ("iommu/vt-d: Enable posted mode for device MSIs") Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Jacob Pan Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20250315025135.2365846-3-seanjc@google.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/irq_remapping.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index c495b533103f3..ea3ca52039196 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -25,11 +25,6 @@ #include "../irq_remapping.h" #include "../iommu-pages.h" -enum irq_mode { - IRQ_REMAPPING, - IRQ_POSTING, -}; - struct ioapic_scope { struct intel_iommu *iommu; unsigned int id; @@ -49,8 +44,8 @@ struct irq_2_iommu { u16 irte_index; u16 sub_handle; u8 irte_mask; - enum irq_mode mode; bool posted_msi; + bool posted_vcpu; }; struct intel_ir_data { @@ -138,7 +133,6 @@ static int alloc_irte(struct intel_iommu *iommu, irq_iommu->irte_index = index; irq_iommu->sub_handle = 0; irq_iommu->irte_mask = mask; - irq_iommu->mode = IRQ_REMAPPING; } raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); @@ -193,8 +187,6 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, rc = qi_flush_iec(iommu, index, 0); - /* Update iommu mode according to the IRTE mode */ - irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING; raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); return rc; @@ -1173,9 +1165,18 @@ static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) { struct intel_ir_data *ir_data = irqd->chip_data; + /* + * Don't modify IRTEs for IRQs that are being posted to vCPUs if the + * host CPU affinity changes. + */ + if (ir_data->irq_2_iommu.posted_vcpu && !force_host) + return; + + ir_data->irq_2_iommu.posted_vcpu = false; + if (ir_data->irq_2_iommu.posted_msi) intel_ir_reconfigure_irte_posted(irqd); - else if (force_host || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) + else modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); } @@ -1270,6 +1271,7 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); + ir_data->irq_2_iommu.posted_vcpu = true; modify_irte(&ir_data->irq_2_iommu, &irte_pi); } @@ -1496,6 +1498,9 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain, struct intel_ir_data *data = irq_data->chip_data; struct irte entry; + WARN_ON_ONCE(data->irq_2_iommu.posted_vcpu); + data->irq_2_iommu.posted_vcpu = false; + memset(&entry, 0, sizeof(entry)); modify_irte(&data->irq_2_iommu, &entry); } From 93ae6e68b6d6b62d92b3a89d1c253d4a1721a1d3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 19 Mar 2025 10:21:01 +0800 Subject: [PATCH 54/54] iommu/vt-d: Fix possible circular locking dependency We have recently seen report of lockdep circular lock dependency warnings on platforms like Skylake and Kabylake: ====================================================== WARNING: possible circular locking dependency detected 6.14.0-rc6-CI_DRM_16276-gca2c04fe76e8+ #1 Not tainted ------------------------------------------------------ swapper/0/1 is trying to acquire lock: ffffffff8360ee48 (iommu_probe_device_lock){+.+.}-{3:3}, at: iommu_probe_device+0x1d/0x70 but task is already holding lock: ffff888102c7efa8 (&device->physical_node_lock){+.+.}-{3:3}, at: intel_iommu_init+0xe75/0x11f0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #6 (&device->physical_node_lock){+.+.}-{3:3}: __mutex_lock+0xb4/0xe40 mutex_lock_nested+0x1b/0x30 intel_iommu_init+0xe75/0x11f0 pci_iommu_init+0x13/0x70 do_one_initcall+0x62/0x3f0 kernel_init_freeable+0x3da/0x6a0 kernel_init+0x1b/0x200 ret_from_fork+0x44/0x70 ret_from_fork_asm+0x1a/0x30 -> #5 (dmar_global_lock){++++}-{3:3}: down_read+0x43/0x1d0 enable_drhd_fault_handling+0x21/0x110 cpuhp_invoke_callback+0x4c6/0x870 cpuhp_issue_call+0xbf/0x1f0 __cpuhp_setup_state_cpuslocked+0x111/0x320 __cpuhp_setup_state+0xb0/0x220 irq_remap_enable_fault_handling+0x3f/0xa0 apic_intr_mode_init+0x5c/0x110 x86_late_time_init+0x24/0x40 start_kernel+0x895/0xbd0 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0xbf/0x110 common_startup_64+0x13e/0x141 -> #4 (cpuhp_state_mutex){+.+.}-{3:3}: __mutex_lock+0xb4/0xe40 mutex_lock_nested+0x1b/0x30 __cpuhp_setup_state_cpuslocked+0x67/0x320 __cpuhp_setup_state+0xb0/0x220 page_alloc_init_cpuhp+0x2d/0x60 mm_core_init+0x18/0x2c0 start_kernel+0x576/0xbd0 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0xbf/0x110 common_startup_64+0x13e/0x141 -> #3 (cpu_hotplug_lock){++++}-{0:0}: __cpuhp_state_add_instance+0x4f/0x220 iova_domain_init_rcaches+0x214/0x280 iommu_setup_dma_ops+0x1a4/0x710 iommu_device_register+0x17d/0x260 intel_iommu_init+0xda4/0x11f0 pci_iommu_init+0x13/0x70 do_one_initcall+0x62/0x3f0 kernel_init_freeable+0x3da/0x6a0 kernel_init+0x1b/0x200 ret_from_fork+0x44/0x70 ret_from_fork_asm+0x1a/0x30 -> #2 (&domain->iova_cookie->mutex){+.+.}-{3:3}: __mutex_lock+0xb4/0xe40 mutex_lock_nested+0x1b/0x30 iommu_setup_dma_ops+0x16b/0x710 iommu_device_register+0x17d/0x260 intel_iommu_init+0xda4/0x11f0 pci_iommu_init+0x13/0x70 do_one_initcall+0x62/0x3f0 kernel_init_freeable+0x3da/0x6a0 kernel_init+0x1b/0x200 ret_from_fork+0x44/0x70 ret_from_fork_asm+0x1a/0x30 -> #1 (&group->mutex){+.+.}-{3:3}: __mutex_lock+0xb4/0xe40 mutex_lock_nested+0x1b/0x30 __iommu_probe_device+0x24c/0x4e0 probe_iommu_group+0x2b/0x50 bus_for_each_dev+0x7d/0xe0 iommu_device_register+0xe1/0x260 intel_iommu_init+0xda4/0x11f0 pci_iommu_init+0x13/0x70 do_one_initcall+0x62/0x3f0 kernel_init_freeable+0x3da/0x6a0 kernel_init+0x1b/0x200 ret_from_fork+0x44/0x70 ret_from_fork_asm+0x1a/0x30 -> #0 (iommu_probe_device_lock){+.+.}-{3:3}: __lock_acquire+0x1637/0x2810 lock_acquire+0xc9/0x300 __mutex_lock+0xb4/0xe40 mutex_lock_nested+0x1b/0x30 iommu_probe_device+0x1d/0x70 intel_iommu_init+0xe90/0x11f0 pci_iommu_init+0x13/0x70 do_one_initcall+0x62/0x3f0 kernel_init_freeable+0x3da/0x6a0 kernel_init+0x1b/0x200 ret_from_fork+0x44/0x70 ret_from_fork_asm+0x1a/0x30 other info that might help us debug this: Chain exists of: iommu_probe_device_lock --> dmar_global_lock --> &device->physical_node_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&device->physical_node_lock); lock(dmar_global_lock); lock(&device->physical_node_lock); lock(iommu_probe_device_lock); *** DEADLOCK *** This driver uses a global lock to protect the list of enumerated DMA remapping units. It is necessary due to the driver's support for dynamic addition and removal of remapping units at runtime. Two distinct code paths require iteration over this remapping unit list: - Device registration and probing: the driver iterates the list to register each remapping unit with the upper layer IOMMU framework and subsequently probe the devices managed by that unit. - Global configuration: Upper layer components may also iterate the list to apply configuration changes. The lock acquisition order between these two code paths was reversed. This caused lockdep warnings, indicating a risk of deadlock. Fix this warning by releasing the global lock before invoking upper layer interfaces for device registration. Fixes: b150654f74bf ("iommu/vt-d: Fix suspicious RCU usage") Closes: https://lore.kernel.org/linux-iommu/SJ1PR11MB612953431F94F18C954C4A9CB9D32@SJ1PR11MB6129.namprd11.prod.outlook.com/ Tested-by: Chaitanya Kumar Borah Cc: stable@vger.kernel.org Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250317035714.1041549-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 072223fe6fe76..4eafc0518fa7c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3049,6 +3049,7 @@ static int __init probe_acpi_namespace_devices(void) if (dev->bus != &acpi_bus_type) continue; + up_read(&dmar_global_lock); adev = to_acpi_device(dev); mutex_lock(&adev->physical_node_lock); list_for_each_entry(pn, @@ -3058,6 +3059,7 @@ static int __init probe_acpi_namespace_devices(void) break; } mutex_unlock(&adev->physical_node_lock); + down_read(&dmar_global_lock); if (ret) return ret;