From 2b938e3db335e3670475e31a722c2bee34748c5a Mon Sep 17 00:00:00 2001 From: Ramesh Thomas Date: Tue, 10 Dec 2024 05:19:37 -0800 Subject: [PATCH 1/9] vfio/pci: Enable iowrite64 and ioread64 for vfio pci Definitions of ioread64 and iowrite64 macros in asm/io.h called by vfio pci implementations are enclosed inside check for CONFIG_GENERIC_IOMAP. They don't get defined if CONFIG_GENERIC_IOMAP is defined. Include linux/io-64-nonatomic-lo-hi.h to define iowrite64 and ioread64 macros when they are not defined. io-64-nonatomic-lo-hi.h maps the macros to generic implementation in lib/iomap.c. The generic implementation does 64 bit rw if readq/writeq is defined for the architecture, otherwise it would do 32 bit back to back rw. Note that there are two versions of the generic implementation that differs in the order the 32 bit words are written if 64 bit support is not present. This is not the little/big endian ordering, which is handled separately. This patch uses the lo followed by hi word ordering which is consistent with current back to back implementation in the vfio/pci code. Signed-off-by: Ramesh Thomas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241210131938.303500-2-ramesh.thomas@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_rdwr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 66b72c2892841..a0595c745732a 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "vfio_pci_priv.h" From b44a06bd28f2732393c2019e3ae0e593ef76c867 Mon Sep 17 00:00:00 2001 From: Ramesh Thomas Date: Tue, 10 Dec 2024 05:19:38 -0800 Subject: [PATCH 2/9] vfio/pci: Remove #ifdef iowrite64 and #ifdef ioread64 Remove the #ifdef iowrite64 and #ifdef ioread64 checks around calls to 64 bit IO access. Since default implementations have been enabled, the checks are not required. Signed-off-by: Ramesh Thomas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241210131938.303500-3-ramesh.thomas@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_rdwr.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index a0595c745732a..78a3d0809415e 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -62,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size); VFIO_IOWRITE(8) VFIO_IOWRITE(16) VFIO_IOWRITE(32) -#ifdef iowrite64 VFIO_IOWRITE(64) -#endif #define VFIO_IOREAD(size) \ int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ @@ -90,9 +88,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size); VFIO_IOREAD(8) VFIO_IOREAD(16) VFIO_IOREAD(32) -#ifdef ioread64 VFIO_IOREAD(64) -#endif #define VFIO_IORDWR(size) \ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\ @@ -128,9 +124,7 @@ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\ VFIO_IORDWR(8) VFIO_IORDWR(16) VFIO_IORDWR(32) -#if defined(ioread64) && defined(iowrite64) VFIO_IORDWR(64) -#endif /* * Read or write from an __iomem region (MMIO or I/O port) with an excluded @@ -156,7 +150,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, else fillable = 0; -#if defined(ioread64) && defined(iowrite64) if (fillable >= 8 && !(off % 8)) { ret = vfio_pci_iordwr64(vdev, iswrite, test_mem, io, buf, off, &filled); @@ -164,7 +157,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, return ret; } else -#endif if (fillable >= 4 && !(off % 4)) { ret = vfio_pci_iordwr32(vdev, iswrite, test_mem, io, buf, off, &filled); @@ -382,12 +374,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; -#ifdef iowrite64 case 8: vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; -#endif } } @@ -441,10 +431,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, pos >= vdev->msix_offset + vdev->msix_size)) return -EINVAL; -#ifndef iowrite64 if (count == 8) return -EINVAL; -#endif ret = vfio_pci_core_setup_barmap(vdev, bar); if (ret) From c5a8b5d740ef3dde319562d2e969888b4b8dfdd8 Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Thu, 2 Jan 2025 13:50:12 -0500 Subject: [PATCH 3/9] vfio/pci: Remove shadow ROM specific code paths After commit 0c0e0736acad ("PCI: Set ROM shadow location in arch code, not in PCI core"), the shadow ROM works the same as regular ROM BARs so these code paths are no longer needed. Signed-off-by: Yunxiang Li Link: https://lore.kernel.org/r/20250102185013.15082-2-Yunxiang.Li@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 8 ++------ drivers/vfio/pci/vfio_pci_core.c | 10 ++-------- drivers/vfio/pci/vfio_pci_rdwr.c | 3 --- 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index ea2745c1ac5e6..e41c3a965663e 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -511,13 +511,9 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *vbar &= cpu_to_le32((u32)mask); - } else if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) { - mask = ~(0x20000 - 1); - mask |= PCI_ROM_ADDRESS_ENABLE; - *vbar &= cpu_to_le32((u32)mask); - } else + } else { *vbar = 0; + } vdev->bardirty = false; } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1a4ed5a357d36..b97a3833ffd14 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1057,14 +1057,8 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, /* Report the BAR size, not the ROM size */ info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - /* Shadow ROMs appear as PCI option ROMs */ - if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) - info.size = 0x20000; - else - break; - } + if (!info.size) + break; /* * Is it really there? Enable memory decode for implicit access diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 78a3d0809415e..0f74f58571fb1 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -237,9 +237,6 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (pci_resource_start(pdev, bar)) end = pci_resource_len(pdev, bar); - else if (bar == PCI_ROM_RESOURCE && - pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW) - end = 0x20000; else return -EINVAL; From e021e6cbfb5a695968afb6619828929a97e4a83a Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Thu, 2 Jan 2025 13:50:13 -0500 Subject: [PATCH 4/9] vfio/pci: Expose setup ROM at ROM bar when needed If ROM bar is missing for any reason, we can fallback to using pdev->rom to expose the ROM content to the guest. This fixes some passthrough use cases where the upstream bridge does not have enough address window. Signed-off-by: Yunxiang Li Link: https://lore.kernel.org/r/20250102185013.15082-3-Yunxiang.Li@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 4 ++++ drivers/vfio/pci/vfio_pci_core.c | 34 ++++++++++++++++-------------- drivers/vfio/pci/vfio_pci_rdwr.c | 22 +++++++++++++------ 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index e41c3a965663e..4c673d842fb35 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -511,6 +511,10 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *vbar &= cpu_to_le32((u32)mask); + } else if (pdev->rom && pdev->romlen) { + mask = ~(roundup_pow_of_two(pdev->romlen) - 1); + mask |= PCI_ROM_ADDRESS_ENABLE; + *vbar &= cpu_to_le32((u32)mask); } else { *vbar = 0; } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index b97a3833ffd14..586e49efb81be 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1054,25 +1054,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); info.flags = 0; + info.size = 0; - /* Report the BAR size, not the ROM size */ - info.size = pci_resource_len(pdev, info.index); - if (!info.size) - break; - - /* - * Is it really there? Enable memory decode for implicit access - * in pci_map_rom(). - */ - cmd = vfio_pci_memory_lock_and_enable(vdev); - io = pci_map_rom(pdev, &size); - if (io) { + if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { + /* + * Check ROM content is valid. Need to enable memory + * decode for ROM access in pci_map_rom(). + */ + cmd = vfio_pci_memory_lock_and_enable(vdev); + io = pci_map_rom(pdev, &size); + if (io) { + info.flags = VFIO_REGION_INFO_FLAG_READ; + /* Report the BAR size, not the ROM size. */ + info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + pci_unmap_rom(pdev, io); + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); + } else if (pdev->rom && pdev->romlen) { info.flags = VFIO_REGION_INFO_FLAG_READ; - pci_unmap_rom(pdev, io); - } else { - info.size = 0; + /* Report BAR size as power of two. */ + info.size = roundup_pow_of_two(pdev->romlen); } - vfio_pci_memory_unlock_and_restore(vdev, cmd); break; } diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 0f74f58571fb1..6192788c8ba39 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -237,6 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (pci_resource_start(pdev, bar)) end = pci_resource_len(pdev, bar); + else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen) + end = roundup_pow_of_two(pdev->romlen); else return -EINVAL; @@ -251,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, * excluded range at the end of the actual ROM. This makes * filling large ROM BARs much faster. */ - io = pci_map_rom(pdev, &x_start); - if (!io) { - done = -ENOMEM; - goto out; + if (pci_resource_start(pdev, bar)) { + io = pci_map_rom(pdev, &x_start); + } else { + io = ioremap(pdev->rom, pdev->romlen); + x_start = pdev->romlen; } + if (!io) + return -ENOMEM; x_end = end; } else { int ret = vfio_pci_core_setup_barmap(vdev, bar); @@ -278,8 +283,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (done >= 0) *ppos += done; - if (bar == PCI_ROM_RESOURCE) - pci_unmap_rom(pdev, io); + if (bar == PCI_ROM_RESOURCE) { + if (pci_resource_start(pdev, bar)) + pci_unmap_rom(pdev, io); + else + iounmap(io); + } + out: return done; } From ce9ff21ea89d191e477a02ad7eabf4f996b80a69 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 22 Jan 2025 10:38:30 -0700 Subject: [PATCH 5/9] vfio/platform: check the bounds of read/write syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit count and offset are passed from user space and not checked, only offset is capped to 40 bits, which can be used to read/write out of bounds of the device. Fixes: 6e3f26456009 (“vfio/platform: read and write support for the device fd”) Cc: stable@vger.kernel.org Reported-by: Mostafa Saleh Reviewed-by: Eric Auger Reviewed-by: Mostafa Saleh Tested-by: Mostafa Saleh Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform_common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index e53757d1d0958..3bf1043cd7957 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -388,6 +388,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, { unsigned int done = 0; + if (off >= reg->size) + return -EINVAL; + + count = min_t(size_t, count, reg->size - off); + if (!reg->ioaddr) { reg->ioaddr = ioremap(reg->addr, reg->size); @@ -467,6 +472,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, { unsigned int done = 0; + if (off >= reg->size) + return -EINVAL; + + count = min_t(size_t, count, reg->size - off); + if (!reg->ioaddr) { reg->ioaddr = ioremap(reg->addr, reg->size); From bd53764a60ad586ad5b6ed339423ad5e67824464 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 24 Jan 2025 18:30:59 +0000 Subject: [PATCH 6/9] vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem NVIDIA's recently introduced Grace Blackwell (GB) Superchip is a continuation with the Grace Hopper (GH) superchip that provides a cache coherent access to CPU and GPU to each other's memory with an internal proprietary chip-to-chip cache coherent interconnect. There is a HW defect on GH systems to support the Multi-Instance GPU (MIG) feature [1] that necessiated the presence of a 1G region with uncached mapping carved out from the device memory. The 1G region is shown as a fake BAR (comprising region 2 and 3) to workaround the issue. This is fixed on the GB systems. The presence of the fix for the HW defect is communicated by the device firmware through the DVSEC PCI config register with ID 3. The module reads this to take a different codepath on GB vs GH. Scan through the DVSEC registers to identify the correct one and use it to determine the presence of the fix. Save the value in the device's nvgrace_gpu_pci_core_device structure. Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu/ [1] CC: Jason Gunthorpe CC: Kevin Tian Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20250124183102.3976-2-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a467085038f0c..b76368958d1c5 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -23,6 +23,11 @@ /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ #define MEMBLK_SIZE SZ_512M +#define DVSEC_BITMAP_OFFSET 0xA +#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0) + +#define GPU_CAP_DVSEC_REGISTER 3 + /* * The state of the two device memory region - resmem and usemem - is * saved as struct mem_region. @@ -46,6 +51,7 @@ struct nvgrace_gpu_pci_core_device { struct mem_region resmem; /* Lock to control device memory kernel mapping */ struct mutex remap_lock; + bool has_mig_hw_bug; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -812,6 +818,26 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, return ret; } +static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) +{ + int pcie_dvsec; + u16 dvsec_ctrl16; + + pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA, + GPU_CAP_DVSEC_REGISTER); + + if (pcie_dvsec) { + pci_read_config_word(pdev, + pcie_dvsec + DVSEC_BITMAP_OFFSET, + &dvsec_ctrl16); + + if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM) + return false; + } + + return true; +} + static int nvgrace_gpu_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -832,6 +858,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, dev_set_drvdata(&pdev->dev, &nvdev->core_device); if (ops == &nvgrace_gpu_pci_ops) { + nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev); + /* * Device memory properties are identified in the host ACPI * table. Set the nvgrace_gpu_pci_core_device structure. From 6a9eb2d125ba90d13b45bcfabcddf9f61268f6a8 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 24 Jan 2025 18:31:00 +0000 Subject: [PATCH 7/9] vfio/nvgrace-gpu: Expose the blackwell device PF BAR1 to the VM There is a HW defect on Grace Hopper (GH) to support the Multi-Instance GPU (MIG) feature [1] that necessiated the presence of a 1G region carved out from the device memory and mapped as uncached. The 1G region is shown as a fake BAR (comprising region 2 and 3) to workaround the issue. The Grace Blackwell systems (GB) differ from GH systems in the following aspects: 1. The aforementioned HW defect is fixed on GB systems. 2. There is a usable BAR1 (region 2 and 3) on GB systems for the GPUdirect RDMA feature [2]. This patch accommodate those GB changes by showing the 64b physical device BAR1 (region2 and 3) to the VM instead of the fake one. This takes care of both the differences. Moreover, the entire device memory is exposed on GB as cacheable to the VM as there is no carveout required. Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu/ [1] Link: https://docs.nvidia.com/cuda/gpudirect-rdma/ [2] Cc: Kevin Tian CC: Jason Gunthorpe Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20250124183102.3976-3-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 67 +++++++++++++++++++---------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index b76368958d1c5..778bfd0655de0 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -17,9 +17,6 @@ #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX -/* Memory size expected as non cached and reserved by the VM driver */ -#define RESMEM_SIZE SZ_1G - /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ #define MEMBLK_SIZE SZ_512M @@ -72,7 +69,7 @@ nvgrace_gpu_memregion(int index, if (index == USEMEM_REGION_INDEX) return &nvdev->usemem; - if (index == RESMEM_REGION_INDEX) + if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) return &nvdev->resmem; return NULL; @@ -757,40 +754,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, u64 memphys, u64 memlength) { int ret = 0; + u64 resmem_size = 0; /* - * The VM GPU device driver needs a non-cacheable region to support - * the MIG feature. Since the device memory is mapped as NORMAL cached, - * carve out a region from the end with a different NORMAL_NC - * property (called as reserved memory and represented as resmem). This - * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while - * exposing the rest (termed as usable memory and represented using usemem) - * as cacheable 64b BAR (region 4 and 5). + * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable + * region to support the MIG feature owing to a hardware bug. Since the + * device memory is mapped as NORMAL cached, carve out a region from the end + * with a different NORMAL_NC property (called as reserved memory and + * represented as resmem). This region then is exposed as a 64b BAR + * (region 2 and 3) to the VM, while exposing the rest (termed as usable + * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5). * * devmem (memlength) * |-------------------------------------------------| * | | * usemem.memphys resmem.memphys + * + * This hardware bug is fixed on the Grace Blackwell platforms and the + * presence of the bug can be determined through nvdev->has_mig_hw_bug. + * Thus on systems with the hardware fix, there is no need to partition + * the GPU device memory and the entire memory is usable and mapped as + * NORMAL cached (i.e. resmem size is 0). */ + if (nvdev->has_mig_hw_bug) + resmem_size = SZ_1G; + nvdev->usemem.memphys = memphys; /* * The device memory exposed to the VM is added to the kernel by the - * VM driver module in chunks of memory block size. Only the usable - * memory (usemem) is added to the kernel for usage by the VM - * workloads. Make the usable memory size memblock aligned. + * VM driver module in chunks of memory block size. Note that only the + * usable memory (usemem) is added to the kernel for usage by the VM + * workloads. */ - if (check_sub_overflow(memlength, RESMEM_SIZE, + if (check_sub_overflow(memlength, resmem_size, &nvdev->usemem.memlength)) { ret = -EOVERFLOW; goto done; } /* - * The USEMEM part of the device memory has to be MEMBLK_SIZE - * aligned. This is a hardwired ABI value between the GPU FW and - * VFIO driver. The VM device driver is also aware of it and make - * use of the value for its calculation to determine USEMEM size. + * The usemem region is exposed as a 64B Bar composed of region 4 and 5. + * Calculate and save the BAR size for the region. + */ + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); + + /* + * If the hardware has the fix for MIG, there is no requirement + * for splitting the device memory to create RESMEM. The entire + * device memory is usable and will be USEMEM. Return here for + * such case. + */ + if (!nvdev->has_mig_hw_bug) + goto done; + + /* + * When the device memory is split to workaround the MIG bug on + * Grace Hopper, the USEMEM part of the device memory has to be + * MEMBLK_SIZE aligned. This is a hardwired ABI value between the + * GPU FW and VFIO driver. The VM device driver is also aware of it + * and make use of the value for its calculation to determine USEMEM + * size. Note that the device memory may not be 512M aligned. */ nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, MEMBLK_SIZE); @@ -809,10 +833,9 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, } /* - * The memory regions are exposed as BARs. Calculate and save - * the BAR size for them. + * The resmem region is exposed as a 64b BAR composed of region 2 and 3 + * for Grace Hopper. Calculate and save the BAR size for the region. */ - nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); done: return ret; From d85f69d520e6aca8ae5ce353666e2fc2756eb9e7 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 24 Jan 2025 18:31:01 +0000 Subject: [PATCH 8/9] vfio/nvgrace-gpu: Check the HBM training and C2C link status In contrast to Grace Hopper systems, the HBM training has been moved out of the UEFI on the Grace Blackwell systems. This reduces the system bootup time significantly. The onus of checking whether the HBM training has completed thus falls on the module. The HBM training status can be determined from a BAR0 register. Similarly, another BAR0 register exposes the status of the CPU-GPU chip-to-chip (C2C) cache coherent interconnect. Based on testing, 30s is determined to be sufficient to ensure initialization completion on all the Grace based systems. Thus poll these register and check for 30s. If the HBM training is not complete or if the C2C link is not ready, fail the probe. While the time is not required on Grace Hopper systems, it is beneficial to make the check to ensure the device is in an expected state. Hence keeping it generalized to both the generations. Ensure that the BAR0 is enabled before accessing the registers. CC: Alex Williamson CC: Kevin Tian CC: Jason Gunthorpe Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20250124183102.3976-4-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 72 +++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 778bfd0655de0..655a624134cca 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -5,6 +5,8 @@ #include #include +#include +#include /* * The device memory usable to the workloads running in the VM is cached @@ -25,6 +27,13 @@ #define GPU_CAP_DVSEC_REGISTER 3 +#define C2C_LINK_BAR0_OFFSET 0x1498 +#define HBM_TRAINING_BAR0_OFFSET 0x200BC +#define STATUS_READY 0xFF + +#define POLL_QUANTUM_MS 1000 +#define POLL_TIMEOUT_MS (30 * 1000) + /* * The state of the two device memory region - resmem and usemem - is * saved as struct mem_region. @@ -861,6 +870,65 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) return true; } +/* + * To reduce the system bootup time, the HBM training has + * been moved out of the UEFI on the Grace-Blackwell systems. + * + * The onus of checking whether the HBM training has completed + * thus falls on the module. The HBM training status can be + * determined from a BAR0 register. + * + * Similarly, another BAR0 register exposes the status of the + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect. + * + * Poll these register and check for 30s. If the HBM training is + * not complete or if the C2C link is not ready, fail the probe. + * + * While the wait is not required on Grace Hopper systems, it + * is beneficial to make the check to ensure the device is in an + * expected state. + * + * Ensure that the BAR0 region is enabled before accessing the + * registers. + */ +static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + void __iomem *io; + int ret = -ETIME; + + ret = pci_enable_device(pdev); + if (ret) + return ret; + + ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME); + if (ret) + goto request_region_exit; + + io = pci_iomap(pdev, 0, 0); + if (!io) { + ret = -ENOMEM; + goto iomap_exit; + } + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { + ret = 0; + goto reg_check_exit; + } + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + +reg_check_exit: + pci_iounmap(pdev, io); +iomap_exit: + pci_release_selected_regions(pdev, 1 << 0); +request_region_exit: + pci_disable_device(pdev); + return ret; +} + static int nvgrace_gpu_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -869,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; + ret = nvgrace_gpu_wait_device_ready(pdev); + if (ret) + return ret; + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); if (!ret) ops = &nvgrace_gpu_pci_ops; From 2bb447540e71ee530388750c38e1b2c8ea08b4b7 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 24 Jan 2025 18:31:02 +0000 Subject: [PATCH 9/9] vfio/nvgrace-gpu: Add GB200 SKU to the devid table NVIDIA is productizing the new Grace Blackwell superchip SKU bearing device ID 0x2941. Add the SKU devid to nvgrace_gpu_vfio_pci_table. CC: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20250124183102.3976-5-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 655a624134cca..e5ac39c4cc6b6 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -991,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, /* GH200 SKU */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, + /* GB200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, {} };