From 60e35355284dbaaeae2e627244ff3346ff9a086f Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Wed, 8 May 2024 15:18:06 -0500 Subject: [PATCH 1/7] NVIDIA: SAUCE: hw/arm: GB200 DirectNIC GPA=HPA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nvidia’s next generation GB200 platform has Blackwell GPU and CX8 directly connected through PCIe Gen6 x16 link. Direct P2P PCIe traffic between GPU and NIC is possible however it requires ATS at its core and Grace CPU does not support PCIe ATS. GPA=HPA solution removes the need for GPA to HPA address translation by configuring PCIe BARs in the VM with HPA. It also enables ACPI PCI DSM by setting ‘preserve_config’ to true to avoid VM from reconfiguring the PCI BARs during boot. Here is the example of PCIe topology that shows GPU and CX8 behind the PCIe Switch: $ lspci -vt -[0000:00]---00.0-[01-07]----00.0-[02-07]--+-00.0-[03]--+-00.0 Mellanox Technologies CX8 Family [ConnectX-8] | \-00.1 Mellanox Technologies CX8 Family [ConnectX-8] \-03.0-[04-07]----00.0-[05-07]--+-08.0-[06]-- \-0c.0-[07]-- -[0002:00]---00.0-[01-07]----00.0-[02-07]--+-00.0-[03]--+-00.0 Mellanox Technologies CX8 Family [ConnectX-8] | \-00.1 Mellanox Technologies CX8 Family [ConnectX-8] \-01.0-[04-07]----00.0-[05-07]--+-08.0-[06]-- \-0c.0-[07]-- -[0005:00]---00.0-[01-0a]----00.0-[02-0a]--+-01.0-[03]-- +-02.0-[04]-- +-03.0-[05]-- +-04.0-[06-07]----00.0-[07]----00.0 ASPEED Technology, Inc. ASPEED Graphics Family +-05.0-[08]----00.0 Renesas Technology Corp. 
uPD720201 USB 3.0 Host Controller +-06.0-[09]----00.0 Intel Corporation I210 Gigabit Network Connection \-07.0-[0a]-- -[0006:00]---00.0-[01-09]----00.0-[02-09]--+-00.0-[03]--+-00.0 Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller | +-00.1 Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller | \-00.2 Mellanox Technologies MT43244 BlueField-3 SoC Management Interface \-02.0-[04-09]----00.0-[05-09]--+-00.0-[06]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO +-04.0-[07]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO +-08.0-[08]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO \-0c.0-[09]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO -[0008:00]---00.0-[01-06]----00.0-[02-06]--+-00.0-[03]----00.0 Mellanox Technologies Device 2100 \-03.0-[04-06]----00.0-[05-06]----00.0-[06]----00.0 NVIDIA Corporation Device 2941 -[0009:00]---00.0-[01-06]----00.0-[02-06]--+-00.0-[03]----00.0 Mellanox Technologies Device 2100 \-01.0-[04-06]----00.0-[05-06]----00.0-[06]----00.0 NVIDIA Corporation Device 2941 -[0010:00]---00.0-[01-07]----00.0-[02-07]--+-00.0-[03]--+-00.0 Mellanox Technologies CX8 Family [ConnectX-8] | \-00.1 Mellanox Technologies CX8 Family [ConnectX-8] \-03.0-[04-07]----00.0-[05-07]--+-08.0-[06]-- \-0c.0-[07]-- -[0012:00]---00.0-[01-07]----00.0-[02-07]--+-00.0-[03]--+-00.0 Mellanox Technologies CX8 Family [ConnectX-8] | \-00.1 Mellanox Technologies CX8 Family [ConnectX-8] \-01.0-[04-07]----00.0-[05-07]--+-08.0-[06]-- \-0c.0-[07]-- -[0015:00]---00.0-[01]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO -[0016:00]---00.0-[01-09]----00.0-[02-09]--+-00.0-[03]--+-00.0 Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller | +-00.1 Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller | \-00.2 Mellanox Technologies MT43244 
BlueField-3 SoC Management Interface \-02.0-[04-09]----00.0-[05-09]--+-00.0-[06]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO +-04.0-[07]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO +-08.0-[08]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO \-0c.0-[09]----00.0 Samsung Electronics Co Ltd NVMe SSD Controller PM9A1/PM9A3/980PRO -[0018:00]---00.0-[01-06]----00.0-[02-06]--+-00.0-[03]----00.0 Mellanox Technologies Device 2100 \-03.0-[04-06]----00.0-[05-06]----00.0-[06]----00.0 NVIDIA Corporation Device 2941 -[0019:00]---00.0-[01-06]----00.0-[02-06]--+-00.0-[03]----00.0 Mellanox Technologies Device 2100 \-01.0-[04-06]----00.0-[05-06]----00.0-[06]----00.0 NVIDIA Corporation Device 2941 GPA=HPA is expected to work with PCIe topology in the VM that resembles to baremetal. In other words, for P2P PCIe traffic (using GPA=HPA) over Gen6, CX8 NIC(the DMA-PF) and GPU assigned to VM should be under the same PCIe switch. Note: PCIe Switch needs special non-conventional ACS configuration such that minimal P2P routes needed for GPU Direct RDMA should be allowed. Signed-off-by: Shanker Donthineni Signed-off-by: Tushar Dave Signed-off-by: Matthew R. 
Ochs --- hw/arm/virt-acpi-build.c | 563 ++++++++++++++++++++++++++++++++++++++- hw/arm/virt.c | 31 +++ include/hw/arm/virt.h | 1 + 3 files changed, 588 insertions(+), 7 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 05266af3ea0..aeb873d6faa 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -49,6 +49,8 @@ #include "hw/cxl/cxl.h" #include "hw/pci/pcie_host.h" #include "hw/pci/pci.h" +#include "hw/vfio/pci.h" +#include "hw/pci/pci_bridge.h" #include "hw/pci/pci_bus.h" #include "hw/pci-host/gpex.h" #include "hw/arm/virt.h" @@ -140,8 +142,551 @@ static void build_acpi0017(Aml *table) aml_append(table, scope); } +typedef struct { + uint64_t addr; + uint64_t end; + uint64_t flags; +} PhysBAR; + +typedef struct { + uint64_t wbase; + uint64_t wlimit; + uint64_t wbase64; + uint64_t wlimit64; + uint64_t rbase; + uint64_t rlimit; + uint64_t rsize; + uint64_t piobase; + bool available; + bool search_mmio64; + PCIDevice *dev; + PCIBus *bus; + struct GPEXConfig *cfg; + bool debug; +} NVIDIACfg; + +#define IORESOURCE_PREFETCH 0x00002000 /* No side effects */ +#define IORESOURCE_MEM_64 0x00100000 + +static void nvidia_get_bridge_window(PCIBus *bus, void *opaque) +{ + PCIDevice *bridge = pci_bridge_get_device(bus); + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + struct GPEXConfig *cfg = ncfg->cfg; + + if (!bridge) { + ncfg->wbase = cfg->mmio32.base; + ncfg->wlimit = cfg->mmio32.base + cfg->mmio32.size - 1; + ncfg->wbase64 = cfg->mmio64.base; + ncfg->wlimit64 = cfg->mmio64.base + cfg->mmio64.size - 1; + } else { + ncfg->wbase = pci_bridge_get_base(bridge, PCI_BASE_ADDRESS_MEM_TYPE_32); + ncfg->wlimit = pci_bridge_get_limit(bridge, PCI_BASE_ADDRESS_MEM_TYPE_32); + ncfg->wbase64 = pci_bridge_get_base(bridge, PCI_BASE_ADDRESS_MEM_PREFETCH); + ncfg->wlimit64 = pci_bridge_get_limit(bridge, PCI_BASE_ADDRESS_MEM_PREFETCH); + } +} + +static void nvidia_update_bridge_window(PCIBus *bus, uint64_t base, uint64_t limit) +{ + PCIDevice *bridge 
= pci_bridge_get_device(bus); + uint32_t value0, value1; + + assert(bridge); + + value0 = (uint32_t)(extract64(base, 20, 12) << 4); + value1 = (uint32_t)(extract64(limit, 20, 12) << 4); + pci_host_config_write_common(bridge, + PCI_PREF_MEMORY_BASE, + pci_config_size(bridge), + value0 | PCI_PREF_RANGE_TYPE_64, + 2); + pci_host_config_write_common(bridge, + PCI_PREF_BASE_UPPER32, + pci_config_size(bridge), + (uint32_t)(base >> 32), + 4); + pci_host_config_write_common(bridge, + PCI_PREF_MEMORY_LIMIT, + pci_config_size(bridge), + value1 | PCI_PREF_RANGE_TYPE_64, + 2); + pci_host_config_write_common(bridge, + PCI_PREF_LIMIT_UPPER32, + pci_config_size(bridge), + (uint32_t)(limit >> 32), + 4); +} + +static void nvidia_dev_vfio(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + struct GPEXConfig *cfg = (struct GPEXConfig *)opaque; + PhysBAR *pbar, pbars[PCI_ROM_SLOT]; + char *tmp, *resources, line[128]; + VFIOPCIDevice *vdev; + uint32_t laddr; + FILE *fp; + int idx; + + if (!object_dynamic_cast(OBJECT(dev), TYPE_VFIO_PCI)) { + return; + } + + vdev = VFIO_PCI_BASE(dev); + + tmp = g_strdup_printf("%s/resource", vdev->vbasedev.sysfsdev); + resources = realpath(tmp, NULL); + g_free(tmp); + + idx = 0; + pbar = pbars; + memset(pbar, 0, sizeof(pbars)); + + fp = fopen(resources, "r"); + g_free(resources); + if (!fp) { + return; + } + + do { + if (!fgets(line, sizeof(line), fp)) { + fclose(fp); + return; + } + sscanf(line, "0x%lx 0x%lx 0x%lx\n", &pbar->addr, + &pbar->end, &pbar->flags); + idx++; + pbar++; + } while (*line && idx < PCI_ROM_SLOT); + + fclose(fp); + + for (idx = 0, pbar = pbars; idx < PCI_ROM_SLOT; idx++, pbar++) { + if (!(pbar->flags & IORESOURCE_PREFETCH)) { + continue; + } + laddr = pbar->addr & PCI_BASE_ADDRESS_MEM_MASK ; + laddr |= PCI_BASE_ADDRESS_MEM_PREFETCH | PCI_BASE_ADDRESS_MEM_TYPE_64; + vfio_pci_write_config(dev, + PCI_BASE_ADDRESS_0 + (idx * 4), + laddr, + 4); + vfio_pci_write_config(dev, + PCI_BASE_ADDRESS_0 + (idx * 4) + 4, + (uint32_t)(pbar->addr >> 
32), + 4); + cfg->preserve_config = true; + } +} + +static void nvidia_bus_vfio(PCIBus *bus, void *opaque) +{ + pci_for_each_device_under_bus(bus, nvidia_dev_vfio, opaque); +} + +static void nvidia_mmio64_window(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + uint64_t rbase, rlimit; + uint32_t idx; + + for (idx = 0; idx < PCI_ROM_SLOT; idx++) { + PCIIORegion *res = &dev->io_regions[idx]; + + if ((!res->size) || + ((res->addr < ncfg->wbase64) || (res->addr > ncfg->wlimit64))) { + continue; + } + rbase = res->addr; + rlimit = res->addr + res->size - 1; + ncfg->rbase = MIN(ncfg->rbase, rbase); + ncfg->rlimit = MAX(ncfg->rlimit, rlimit); + } + + if (IS_PCI_BRIDGE(dev)) { + rbase = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_MEM_PREFETCH); + rlimit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_MEM_PREFETCH); + + if ((rbase < ncfg->wbase64) || + (rbase > ncfg->wlimit64) || + (rlimit < ncfg->wbase64) || + (rlimit > ncfg->wlimit64)) { + return; + } + + ncfg->rbase = MIN(ncfg->rbase, rbase); + ncfg->rlimit = MAX(ncfg->rlimit, rlimit); + } +} + +static void nvidia_bus_update_bridge_window(PCIBus *bus, void *opaque) +{ + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + ncfg->rbase = ~0; + ncfg->rlimit = 0; + + assert(pci_bridge_get_device(bus)); + pci_for_each_device_under_bus(bus, nvidia_mmio64_window, ncfg); + + if (ncfg->rlimit > ncfg->rbase) { + nvidia_update_bridge_window(bus, ncfg->rbase, ncfg->rlimit); + } +} + +static void nvidia_dev_rom_max_size(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + uint64_t base, size, rsize = 0; + + size = dev->io_regions[PCI_ROM_SLOT].size; + if (!size) { + return; + } + + base = pci_host_config_read_common(dev, + PCI_ROM_ADDRESS, + pci_config_size(dev), + 4); + base &= ~(size - 1); + if ((base >= ncfg->wbase) && + ((base + size - 1) <= ncfg->wlimit)) { + return; + } + + if (size > rsize) { + ncfg->rsize = size; + ncfg->dev = dev; + } +} + +static void 
nvidia_find_mmio_helper(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + uint64_t base, limit, wbase, wlimit; + uint32_t idx; + PCIIORegion *res; + + if (ncfg->search_mmio64) { + wbase = ncfg->wbase64; + wlimit = ncfg->wlimit64; + } else { + wbase = ncfg->wbase; + wlimit = ncfg->wlimit; + } + + for (idx = 0; idx < PCI_NUM_REGIONS; idx++) { + res = &dev->io_regions[idx]; + if ((!res->size) || (res->type & PCI_BASE_ADDRESS_SPACE_IO)) { + continue; + } + + if (ncfg->search_mmio64) { + if ((!(res->type & PCI_BASE_ADDRESS_MEM_TYPE_64)) || + (!(res->type & PCI_BASE_ADDRESS_MEM_PREFETCH))) { + continue; + } + } + + if (idx == PCI_ROM_SLOT) { + base = pci_host_config_read_common(dev, + PCI_ROM_ADDRESS, + pci_config_size(dev), + 4); + } else { + base = res->addr; + } + + base &= ~(res->size - 1); + if ((base < wbase) || ((base + res->size - 1) > wlimit)) { + continue; + } + + if (ranges_overlap(ncfg->rbase, ncfg->rsize, base, res->size)) { + ncfg->rbase = QEMU_ALIGN_UP(base + res->size, ncfg->rsize); + ncfg->rlimit = ncfg->rbase + ncfg->rsize - 1; + ncfg->available = false; + } + } + + if (IS_PCI_BRIDGE(dev)) { + + if (ncfg->search_mmio64) { + base = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_MEM_PREFETCH); + limit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_MEM_PREFETCH); + } else { + base = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_MEM_TYPE_32); + limit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_MEM_TYPE_32); + } + + if ((base < wbase) || (limit > wlimit)) { + return; + } + + if (ranges_overlap(ncfg->rbase, ncfg->rsize, base, limit - base + 1)) { + ncfg->rbase = QEMU_ALIGN_UP(limit + 1, ncfg->rsize); + ncfg->rlimit = ncfg->rbase + ncfg->rsize - 1; + ncfg->available = false; + } + } +} + +static bool nvidia_find_mmio(PCIBus *bus, NVIDIACfg *ncfg) +{ + uint64_t wlimit; + + if (ncfg->search_mmio64) { + ncfg->rbase = ncfg->wbase64; + wlimit = ncfg->wlimit64; + } else { + ncfg->rbase = ncfg->wbase; + wlimit = ncfg->wlimit; + } + 
ncfg->rlimit = ncfg->rbase + ncfg->rsize - 1; + + while (ncfg->rlimit <= wlimit) { + ncfg->available = true; + pci_for_each_device_under_bus(bus, nvidia_find_mmio_helper, ncfg); + if (ncfg->available) { + return true; + } + } + return false; +} + +static void nvidia_bus_adjust_mmio32_rom(PCIBus *bus, void *opaque) +{ + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + + ncfg->search_mmio64 = false; + nvidia_get_bridge_window(bus, ncfg); + + do { + ncfg->rsize = 0; + pci_for_each_device_under_bus(bus, nvidia_dev_rom_max_size, ncfg); + if (!ncfg->rsize) + break; + if (nvidia_find_mmio(bus, ncfg)) { + pci_host_config_write_common(ncfg->dev, + PCI_ROM_ADDRESS, + pci_config_size(ncfg->dev), + ncfg->rbase, + 4); + } + } while (true); +} + + +static void nvidia_dev_shift_mmio64(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + uint64_t addr; + uint32_t idx; + + for (idx = 0; idx < PCI_ROM_SLOT; idx++) { + PCIIORegion *res = &dev->io_regions[idx]; + + if ((!res->size) || + (!(res->type & PCI_BASE_ADDRESS_MEM_TYPE_64)) || + (!(res->type & PCI_BASE_ADDRESS_MEM_PREFETCH))) { + continue; + } + + addr = res->addr & PCI_BASE_ADDRESS_MEM_MASK; + if ((addr >= ncfg->wbase64) && (addr <= ncfg->wlimit64)) { + continue; + } + + addr += ncfg->rbase; + addr |= PCI_BASE_ADDRESS_MEM_PREFETCH | PCI_BASE_ADDRESS_MEM_TYPE_64; + + pci_host_config_write_common(dev, + PCI_BASE_ADDRESS_0 + (idx * 4), + pci_config_size(dev), + (uint32_t)(addr & 0xffffffff), + 4); + pci_host_config_write_common(dev, + PCI_BASE_ADDRESS_0 + (idx * 4) + 4, + pci_config_size(dev), + (uint32_t)(addr >> 32), + 4); + } +} + +static void nvidia_dev_unassigned_mmio64(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + NVIDIACfg *ncfg0 = (NVIDIACfg *)opaque; + struct GPEXConfig *cfg = ncfg0->cfg; + NVIDIACfg ncfg1, *ncfg = &ncfg1; + uint64_t base, limit; + PCIBus *sbus; + + if (!IS_PCI_BRIDGE(dev)) { + return; + } + + sbus = &PCI_BRIDGE(dev)->sec_bus; + memcpy(ncfg, ncfg0, sizeof(NVIDIACfg)); + 
base = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_MEM_PREFETCH); + limit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_MEM_PREFETCH); + + if ((base >= ncfg->wbase64) && + (base <= ncfg->wlimit64) && + (limit >= ncfg->wbase64) && + (limit <= ncfg->wlimit64)) { + return; + } + + ncfg->rsize = base >= limit ? 0x100000 : limit - base + 1; + ncfg->search_mmio64 = true; + nvidia_get_bridge_window(bus, ncfg); + + /* Check if the required space is free in the parent bus */ + if (!nvidia_find_mmio(bus, ncfg)) { + + /* Try with the extended parent window */ + ncfg->rbase = QEMU_ALIGN_UP(ncfg->wlimit64 + 1, ncfg->rsize); + ncfg->wlimit64 = ncfg->rbase + ncfg->rsize - 1; + /* TODO: check conflicts with the extended window */ + } + + if (base >= limit) { + nvidia_update_bridge_window(sbus, ncfg->rbase, ncfg->rlimit); + } else { + ncfg->rbase -= base; + pci_for_each_device_under_bus(sbus, nvidia_dev_shift_mmio64, ncfg); + } + + ncfg->wbase64 = cfg->mmio64.base + cfg->mmio64.size / 2; + ncfg->wlimit64 = ncfg->wbase64 + (cfg->mmio64.size / 2) - 1; + pci_for_each_bus(ncfg->bus, nvidia_bus_update_bridge_window, ncfg); +} + +static void nvidia_bus_unassigned_mmio64(PCIBus *bus, void *opaque) +{ + pci_for_each_device_under_bus(bus, nvidia_dev_unassigned_mmio64, opaque); +} + +static void nvidia_dev_assign_pio(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + struct GPEXConfig *cfg = ncfg->cfg; + PCIIORegion *res; + uint32_t idx; + + for (idx = 0; idx < PCI_NUM_REGIONS; idx++) { + res = &dev->io_regions[idx]; + + if ((!res->size) || (!(res->type & PCI_BASE_ADDRESS_SPACE_IO))) { + continue; + } + ncfg->piobase = QEMU_ALIGN_UP(ncfg->piobase, res->size); + pci_host_config_write_common(dev, + PCI_BASE_ADDRESS_0 + (idx * 4), + pci_config_size(dev), + (uint32_t)(ncfg->piobase - cfg->pio.base), + 4); + ncfg->piobase += res->size; + } +} + +static void nvidia_pio_window(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + 
uint64_t rbase, rlimit; + uint32_t idx; + + for (idx = 0; idx < PCI_ROM_SLOT; idx++) { + PCIIORegion *res = &dev->io_regions[idx]; + + if ((!res->size) || (!(res->type & PCI_BASE_ADDRESS_SPACE_IO))) { + continue; + } + + rbase = res->addr; + rlimit = res->addr + res->size - 1; + ncfg->rbase = MIN(ncfg->rbase, rbase); + ncfg->rlimit = MAX(ncfg->rlimit, rlimit); + } + + if (IS_PCI_BRIDGE(dev)) { + rbase = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_SPACE_IO); + rlimit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_SPACE_IO); + + ncfg->rbase = MIN(ncfg->rbase, rbase); + ncfg->rlimit = MAX(ncfg->rlimit, rlimit); + } +} + +static void nvidia_bus_assign_pio(PCIBus *bus, void *opaque) +{ + PCIDevice *bridge = pci_bridge_get_device(bus); + NVIDIACfg *ncfg = (NVIDIACfg *)opaque; + uint32_t value0, value1; + + ncfg->piobase = QEMU_ALIGN_UP(ncfg->piobase, 0x1000); + pci_for_each_device_under_bus(bus, nvidia_dev_assign_pio, ncfg); + if (!bridge) { + return; + } + + ncfg->rbase = ~0; + ncfg->rlimit = 0; + pci_for_each_device_under_bus(bus, nvidia_pio_window, ncfg); + + if (ncfg->rbase > ncfg->rlimit) { + ncfg->rbase = QEMU_ALIGN_UP(ncfg->piobase, 0x1000); + ncfg->piobase += 0x1000; + ncfg->rlimit = ncfg->piobase - 1; + } + + value0 = (uint32_t)(extract64(ncfg->rbase, 12, 4) << 4); + value1 = (uint32_t)(extract64(ncfg->rlimit, 12, 4) << 4); + + pci_host_config_write_common(bridge, + PCI_IO_BASE, + pci_config_size(bridge), + value0 | PCI_IO_RANGE_TYPE_16, + 1); + pci_host_config_write_common(bridge, + PCI_IO_LIMIT, + pci_config_size(bridge), + value1 | PCI_IO_RANGE_TYPE_16, + 1); +} + +static void nvidia_prepare_mmio64_identity(struct GPEXConfig *cfg) +{ + NVIDIACfg ncfg1, *ncfg = &ncfg1; + PCIBus *bus = cfg->bus; + + pci_for_each_bus(bus, nvidia_bus_vfio, cfg); + if (!cfg->preserve_config) { + return; + } + + memset(ncfg, 0, sizeof(NVIDIACfg)); + ncfg->cfg = cfg; + + nvidia_get_bridge_window(bus, ncfg); + pci_for_each_bus(bus, nvidia_bus_adjust_mmio32_rom, ncfg); + + ncfg->piobase = 
cfg->pio.base; + pci_for_each_bus(bus, nvidia_bus_assign_pio, ncfg); + + nvidia_get_bridge_window(bus, ncfg); + + QLIST_FOREACH(bus, &bus->child, sibling) { + ncfg->bus = bus; + ncfg->wbase64 = cfg->mmio64.base + cfg->mmio64.size / 2; + ncfg->wlimit64 = ncfg->wbase64 + (cfg->mmio64.size / 2) - 1; + + pci_for_each_bus(bus, nvidia_bus_update_bridge_window, ncfg); + pci_for_each_bus(bus, nvidia_bus_unassigned_mmio64, ncfg); + } +} + static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, - uint32_t irq, VirtMachineState *vms) + uint32_t irq, VirtMachineState *vms, bool update) { int ecam_id = VIRT_ECAM_ID(vms->highmem_ecam); bool cxl_present = false; @@ -173,6 +718,10 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, if (vms->highmem_mmio) { cfg.mmio64 = memmap[VIRT_HIGH_PCIE_MMIO]; + + if (vms->grace_pcie_mmio_identity && update) { + nvidia_prepare_mmio64_identity(&cfg); + } } acpi_dsdt_add_gpex(scope, &cfg); @@ -1143,7 +1692,7 @@ static int acpi_dsdt_add_cmdqv(Aml *scope, GArray *smmuv3_devs) /* DSDT */ static void -build_dsdt(GArray *table_data, AcpiBuildTables *tables, VirtMachineState *vms) +build_dsdt(GArray *table_data, AcpiBuildTables *tables, VirtMachineState *vms, bool update) { VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); GArray *smmuv3_devs = tables->smmuv3_devs; @@ -1179,7 +1728,7 @@ build_dsdt(GArray *table_data, AcpiBuildTables *tables, VirtMachineState *vms) virtio_acpi_dsdt_add(scope, memmap[VIRT_MMIO].base, memmap[VIRT_MMIO].size, (irqmap[VIRT_MMIO] + ARM_SPI_BASE), 0, NUM_VIRTIO_TRANSPORTS); - acpi_dsdt_add_pci(scope, memmap, irqmap[VIRT_PCIE] + ARM_SPI_BASE, vms); + acpi_dsdt_add_pci(scope, memmap, irqmap[VIRT_PCIE] + ARM_SPI_BASE, vms, update); if (vms->acpi_dev) { build_ged_aml(scope, "\\_SB."GED_DEVICE, HOTPLUG_HANDLER(vms->acpi_dev), @@ -1274,7 +1823,7 @@ static void virt_acpi_prebuild(VirtMachineState *vms, AcpiBuildTables *tables) } static -void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables 
*tables) +void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables, bool update) { VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); GArray *table_offsets; @@ -1292,7 +1841,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) /* DSDT is pointed to by FADT */ dsdt = tables_blob->len; - build_dsdt(tables_blob, tables, vms); + build_dsdt(tables_blob, tables, vms, update); /* FADT MADT PPTT GTDT MCFG SPCR DBG2 pointed to by RSDT */ acpi_add_table(table_offsets, tables_blob); @@ -1438,7 +1987,7 @@ static void virt_acpi_build_update(void *build_opaque) acpi_build_tables_init(&tables); - virt_acpi_build(VIRT_MACHINE(qdev_get_machine()), &tables); + virt_acpi_build(VIRT_MACHINE(qdev_get_machine()), &tables, true); acpi_ram_update(build_state->table_mr, tables.table_data); acpi_ram_update(build_state->rsdp_mr, tables.rsdp); @@ -1482,7 +2031,7 @@ void virt_acpi_setup(VirtMachineState *vms) build_state = g_malloc0(sizeof *build_state); acpi_build_tables_init(&tables); - virt_acpi_build(vms, &tables); + virt_acpi_build(vms, &tables, false); /* Now expose it all to Guest */ build_state->table_mr = acpi_add_rom_blob(virt_acpi_build_update, diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 418ed77debf..8bced5ea60c 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1693,6 +1693,9 @@ static void create_pcie(VirtMachineState *vms) 2, base_ecam, 2, size_ecam); if (vms->highmem_mmio) { + if (vms->grace_pcie_mmio_identity && virt_is_acpi_enabled(vms)) { + size_mmio_high = size_mmio_high >> 1; + } qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "ranges", 1, FDT_PCI_RANGE_IOPORT, 2, 0, 2, base_pio, 2, size_pio, @@ -1955,6 +1958,12 @@ static void virt_set_high_memmap(VirtMachineState *vms, vms->highest_gpa = base - 1; } } + + if (vms->grace_pcie_mmio_identity) { + vms->highest_gpa = BIT_ULL(pa_bits) - 1; + vms->memmap[VIRT_HIGH_PCIE_MMIO].base = 0x400000000000; + vms->memmap[VIRT_HIGH_PCIE_MMIO].size = 0x400000000000; + } } static void 
virt_set_memmap(VirtMachineState *vms, int pa_bits) @@ -2793,6 +2802,20 @@ static void virt_set_oem_table_id(Object *obj, const char *value, strncpy(vms->oem_table_id, value, 8); } +static bool virt_get_grace_pcie_mmio_identity(Object *obj, Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + + return vms->grace_pcie_mmio_identity; +} + +static void virt_set_grace_pcie_mmio_identity(Object *obj, bool value, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + + vms->grace_pcie_mmio_identity = value; +} bool virt_is_acpi_enabled(VirtMachineState *vms) { @@ -3498,6 +3521,14 @@ static void virt_machine_class_init(ObjectClass *oc, const void *data) "in ACPI table header." "The string may be up to 8 bytes in size"); + object_class_property_add_bool(oc, "grace-pcie-mmio-identity", + virt_get_grace_pcie_mmio_identity, + virt_set_grace_pcie_mmio_identity); + object_class_property_set_description(oc, "grace-pcie-mmio-identity", + "Set on/off to enable/disable " + "mapping PCIe 64bit BARs with " + "HPA = IPA for pass-through devices"); + } static void virt_instance_init(Object *obj) diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 0963356fc26..6f375e1b018 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -182,6 +182,7 @@ struct VirtMachineState { CXLState cxl_devices_state; bool legacy_smmuv3_present; bool pci_preserve_config; + bool grace_pcie_mmio_identity; }; #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) From 2946d5f81901aaf36f38c1cd9ecb4284f1c22467 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Tue, 19 Nov 2024 00:45:33 +0000 Subject: [PATCH 2/7] NVIDIA: SAUCE: hw/arm: GB200 workaround for GPU BAR1 HPA Grace Blackwell GPU PCIe BAR1 is real BAR exposed to VM that can be used for GPUdirect RDMA [1]. This patch assigns HPA to BAR1 in the VM for the reason mentioned in the commit 54db2e4a632 ("hw/arm: GB200 DirectNIC GPA=HPA"). 
This patch also assigns appropriate GPA to GPU BAR2 (exposed to VM with the same size as BAR 1 that emulates C2C cache coherent address space) to avoid region conflict in PCI bus resource assignment. [1]: https://lore.kernel.org/lkml/20241006102722.3991-1-ankita@nvidia.com/ Signed-off-by: Tushar Dave Signed-off-by: Matthew R. Ochs --- hw/arm/virt-acpi-build.c | 46 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index aeb873d6faa..0d5d959a16c 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -218,15 +218,51 @@ static void nvidia_update_bridge_window(PCIBus *bus, uint64_t base, uint64_t lim 4); } +static void fix_pci_bar_GB200_nvidia(PCIDevice *dev, PhysBAR *pbars) +{ + PhysBAR *pbar = pbars; + bool overlap; + int idx; + + for (idx = PCI_ROM_SLOT - 1 ; idx >= 0; idx--) { + if (!(pbar[idx].flags & IORESOURCE_PREFETCH)) + continue; + + pbar[idx].addr = pbars[idx].addr; + pbar[idx].end = pbar[idx].addr + dev->io_regions[idx].size - 1; + } + + /* Make sure BAR1 gets GPA=HPA, adjust other two BARs accordingly to avoind region conflict */ + overlap = true; + while (overlap) { + overlap = false; + + for (idx = 0; idx <= PCI_ROM_SLOT - 1; idx++) { + if (!(pbar[idx].flags & IORESOURCE_PREFETCH)) + continue; + + for (int j = 0; j < PCI_ROM_SLOT; j++) { + if (!(pbar[j].flags & IORESOURCE_PREFETCH) || idx == j) + continue; + + if(ranges_overlap(pbar[idx].addr, dev->io_regions[idx].size, pbar[j].addr, dev->io_regions[j].size)) { + pbar[j].addr = QEMU_ALIGN_UP(pbar[idx].addr + dev->io_regions[idx].size, dev->io_regions[j].size); + overlap = true; + } + } + } + } +} + static void nvidia_dev_vfio(PCIBus *bus, PCIDevice *dev, void *opaque) { struct GPEXConfig *cfg = (struct GPEXConfig *)opaque; PhysBAR *pbar, pbars[PCI_ROM_SLOT]; char *tmp, *resources, line[128]; + int idx, vendor_id, device_id; VFIOPCIDevice *vdev; uint32_t laddr; FILE *fp; - int idx; if 
(!object_dynamic_cast(OBJECT(dev), TYPE_VFIO_PCI)) { return; @@ -261,6 +297,14 @@ static void nvidia_dev_vfio(PCIBus *bus, PCIDevice *dev, void *opaque) fclose(fp); + vendor_id = pci_get_word(dev->config + PCI_VENDOR_ID); + device_id = pci_get_word(dev->config + PCI_DEVICE_ID); + + /* Nvidia GB200 workaround */ + if (vendor_id == 0x10de && device_id == 0x2941) { + fix_pci_bar_GB200_nvidia(dev, pbars); + } + for (idx = 0, pbar = pbars; idx < PCI_ROM_SLOT; idx++, pbar++) { if (!(pbar->flags & IORESOURCE_PREFETCH)) { continue; From 572888e8232b7a545629aa0ba8d2dcb15a0aff01 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Mon, 25 Aug 2025 21:02:51 +0000 Subject: [PATCH 3/7] NVIDIA: SAUCE: hw/arm: GB300 workaround for GPU BAR1 GPA=HPA Simialr to GB200, GB300 also requires workaround to make GPU BAR 1 GPA=HPA. Signed-off-by: Tushar Dave --- hw/arm/virt-acpi-build.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 0d5d959a16c..c99e0013772 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -300,8 +300,8 @@ static void nvidia_dev_vfio(PCIBus *bus, PCIDevice *dev, void *opaque) vendor_id = pci_get_word(dev->config + PCI_VENDOR_ID); device_id = pci_get_word(dev->config + PCI_DEVICE_ID); - /* Nvidia GB200 workaround */ - if (vendor_id == 0x10de && device_id == 0x2941) { + /* Nvidia GB200/GB300 workaround */ + if ((vendor_id == 0x10de && device_id == 0x2941) || (vendor_id == 0x10de && device_id == 0x31c2)) { fix_pci_bar_GB200_nvidia(dev, pbars); } From ec9eaaddacbc366ed516eff80dbdee16bd7e03ee Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Wed, 30 Jul 2025 12:44:42 -0700 Subject: [PATCH 4/7] NVIDIA: SAUCE: xio3130_downstream: Add ACS support for downstream PCIe ports When PASID capable device is added behind the PCIe downstream port, for example Nvidia GPU, the PCIe downstream ports must expose ACS capability otherwise PASID won't get enabled. 
In addition, the other usecase is GPUDirect RDMA using Data Direct that must require special ACS controls at the PCIe downstream ports for P2P communication. Signed-off-by: Tushar Dave --- hw/pci-bridge/xio3130_downstream.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/pci-bridge/xio3130_downstream.c b/hw/pci-bridge/xio3130_downstream.c index dc7d1aa7d77..44a6f12a34f 100644 --- a/hw/pci-bridge/xio3130_downstream.c +++ b/hw/pci-bridge/xio3130_downstream.c @@ -40,6 +40,8 @@ #define XIO3130_SSVID_SSID 0 #define XIO3130_EXP_OFFSET 0x90 #define XIO3130_AER_OFFSET 0x100 +#define XIO3130_ACS_OFFSET \ + (XIO3130_AER_OFFSET + PCI_ERR_SIZEOF) static void xio3130_downstream_write_config(PCIDevice *d, uint32_t address, uint32_t val, int len) @@ -60,6 +62,7 @@ static void xio3130_downstream_reset(DeviceState *qdev) pcie_cap_deverr_reset(d); pcie_cap_slot_reset(d); pcie_cap_arifwd_reset(d); + pcie_acs_reset(d); pci_bridge_reset(qdev); } @@ -111,6 +114,8 @@ static void xio3130_downstream_realize(PCIDevice *d, Error **errp) goto err; } + pcie_acs_init(d, XIO3130_ACS_OFFSET); + return; err: From 1ecaf2780ad5f812fa1670b8e29502581bf87e48 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 16 Jul 2025 10:35:07 +0300 Subject: [PATCH 5/7] NVIDIA: SAUCE: Expose ACS caps on Guest as configured by the HYP To support P2P on Guest we must expose to the guest OS the actual PCIe topology and configuration as set by the HYP. Otherwise, the behavior is considered as un-defined. It might fail by SW or HW. Extend both root port and downstream port to get acs caps that should match the HYP and use them in the guest. 
Signed-off-by: Yishai Hadas Signed-off-by: Tushar Dave --- hw/pci-bridge/pcie_root_port.c | 10 ++++++++-- hw/pci-bridge/xio3130_downstream.c | 9 +++++++-- hw/pci/pcie.c | 16 +++++++++++++--- include/hw/pci/pcie.h | 4 ++-- include/hw/pci/pcie_port.h | 1 + 5 files changed, 31 insertions(+), 9 deletions(-) diff --git a/hw/pci-bridge/pcie_root_port.c b/hw/pci-bridge/pcie_root_port.c index 22c2fdb71e7..acc3f8d89a1 100644 --- a/hw/pci-bridge/pcie_root_port.c +++ b/hw/pci-bridge/pcie_root_port.c @@ -46,6 +46,7 @@ static void rp_write_config(PCIDevice *d, uint32_t address, static void rp_reset_hold(Object *obj, ResetType type) { PCIDevice *d = PCI_DEVICE(obj); + PCIEPort *p = PCIE_PORT(d); DeviceState *qdev = DEVICE(obj); rp_aer_vector_update(d); @@ -53,7 +54,7 @@ static void rp_reset_hold(Object *obj, ResetType type) pcie_cap_deverr_reset(d); pcie_cap_slot_reset(d); pcie_cap_arifwd_reset(d); - pcie_acs_reset(d); + pcie_acs_reset(d, p->acs_caps); pcie_aer_root_reset(d); pci_bridge_reset(qdev); pci_bridge_disable_base_limit(d); @@ -118,7 +119,10 @@ static void rp_realize(PCIDevice *d, Error **errp) rp_aer_vector_update(d); if (rpc->acs_offset && !s->disable_acs) { - pcie_acs_init(d, rpc->acs_offset); + rc = pcie_acs_init(d, rpc->acs_offset, p->acs_caps, errp); + if (rc < 0) { + goto err; + } } return; @@ -152,6 +156,8 @@ static const Property rp_props[] = { DEFINE_PROP_BIT(COMPAT_PROP_PCP, PCIDevice, cap_present, QEMU_PCIE_SLTCAP_PCP_BITNR, true), DEFINE_PROP_BOOL("disable-acs", PCIESlot, disable_acs, false), + DEFINE_PROP_UINT16("acs-caps", PCIEPort, + acs_caps, 0), }; static void rp_instance_post_init(Object *obj) diff --git a/hw/pci-bridge/xio3130_downstream.c b/hw/pci-bridge/xio3130_downstream.c index 44a6f12a34f..9674c06c0b7 100644 --- a/hw/pci-bridge/xio3130_downstream.c +++ b/hw/pci-bridge/xio3130_downstream.c @@ -58,11 +58,12 @@ static void xio3130_downstream_write_config(PCIDevice *d, uint32_t address, static void xio3130_downstream_reset(DeviceState *qdev) { 
PCIDevice *d = PCI_DEVICE(qdev); + PCIEPort *p = PCIE_PORT(d); pcie_cap_deverr_reset(d); pcie_cap_slot_reset(d); pcie_cap_arifwd_reset(d); - pcie_acs_reset(d); + pcie_acs_reset(d, p->acs_caps); pci_bridge_reset(qdev); } @@ -114,7 +115,10 @@ static void xio3130_downstream_realize(PCIDevice *d, Error **errp) goto err; } - pcie_acs_init(d, XIO3130_ACS_OFFSET); + rc = pcie_acs_init(d, XIO3130_ACS_OFFSET, p->acs_caps, errp); + if (rc < 0) { + goto err; + } return; @@ -142,6 +146,7 @@ static void xio3130_downstream_exitfn(PCIDevice *d) static const Property xio3130_downstream_props[] = { DEFINE_PROP_BIT(COMPAT_PROP_PCP, PCIDevice, cap_present, QEMU_PCIE_SLTCAP_PCP_BITNR, true), + DEFINE_PROP_UINT16("acs-caps", PCIEPort, acs_caps, 0), }; static const VMStateDescription vmstate_xio3130_downstream = { diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index eaeb68894e6..3c6b9602f38 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1178,7 +1178,7 @@ void pcie_ats_init(PCIDevice *dev, uint16_t offset, bool aligned) } /* ACS (Access Control Services) */ -void pcie_acs_init(PCIDevice *dev, uint16_t offset) +int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits, Error **errp) { bool is_downstream = pci_is_express_downstream_port(dev); uint16_t cap_bits = 0; @@ -1202,16 +1202,26 @@ void pcie_acs_init(PCIDevice *dev, uint16_t offset) */ cap_bits = PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF | PCI_ACS_DT; + + if (ctrl_bits & ~cap_bits) { + error_setg(errp, "Unsupported ACS capabilities 0x%hx were supplied. 
" "Supported capabilities are 0x%hx", ctrl_bits & ~cap_bits, + cap_bits); + return -EINVAL; + } } pci_set_word(dev->config + offset + PCI_ACS_CAP, cap_bits); pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, cap_bits); + pci_set_word(dev->config + offset + PCI_ACS_CTRL, ctrl_bits); + + return 0; } -void pcie_acs_reset(PCIDevice *dev) +void pcie_acs_reset(PCIDevice *dev, uint16_t val) { if (dev->exp.acs_cap) { - pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0); + pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, val); } } diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index ff6ce08e135..fde4942feb8 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -135,8 +135,8 @@ void pcie_add_capability(PCIDevice *dev, uint16_t offset, uint16_t size); void pcie_sync_bridge_lnk(PCIDevice *dev); -void pcie_acs_init(PCIDevice *dev, uint16_t offset); -void pcie_acs_reset(PCIDevice *dev); +int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits, Error **errp); +void pcie_acs_reset(PCIDevice *dev, uint16_t val); void pcie_ari_init(PCIDevice *dev, uint16_t offset); void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t ser_num); diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h index 7cd7af8cfa4..4cb40f3226e 100644 --- a/include/hw/pci/pcie_port.h +++ b/include/hw/pci/pcie_port.h @@ -36,6 +36,7 @@ struct PCIEPort { /* pci express switch port */ uint8_t port; + uint16_t acs_caps; }; void pcie_port_init_reg(PCIDevice *d); From aa060214e3bcadceff8183ab10f503325f16e030 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Tue, 26 Aug 2025 18:24:25 +0000 Subject: [PATCH 6/7] NVIDIA: SAUCE: pcie/acs: Make ACS Control read-only to guests GPUDirect RDMA using data-direct requires a specific ACS configuration on PCIe Root Ports and Downstream Ports. While ACS can be configured via QEMU's 'acs-caps' property, the guest kernel may overwrite ACS during standard programming.
This change blocks all guest writes to the PCIe ACS Control register and preserves QEMU-provided ACS settings across device resets on PCIe Root Ports and Downstream Ports. Signed-off-by: Tushar Dave --- hw/pci/pcie.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 3c6b9602f38..738eb2e24d4 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1182,6 +1182,7 @@ int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits, Error **e { bool is_downstream = pci_is_express_downstream_port(dev); uint16_t cap_bits = 0; + PCIEPort *p = PCIE_PORT(dev); /* For endpoints, only multifunction devs may have an ACS capability: */ assert(is_downstream || @@ -1212,7 +1213,14 @@ int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits, Error **e } pci_set_word(dev->config + offset + PCI_ACS_CAP, cap_bits); - pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, cap_bits); + + if (is_downstream && p->acs_caps) { + /* Block guest writes to ACS Control entirely to preserve QEMU ACS settings */ + pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, 0); + } else { + pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, cap_bits); + } + + pci_set_word(dev->config + offset + PCI_ACS_CTRL, ctrl_bits); return 0; From bbbdeb8ad32a91b881aaf5c16298a25f6575946d Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Tue, 26 Aug 2025 23:06:14 +0000 Subject: [PATCH 7/7] NVIDIA: SAUCE: README: GB200/GB300 GPUDirect RDMA using Data Direct Testing command for GB200/GB300 GPUDirect RDMA using Nvidia GPU, CX8 and Data Direct Interface: qemu-system-aarch64 \ -object iommufd,id=iommufd0 \ -machine hmat=on -machine virt,accel=kvm,gic-version=3,ras=on,grace-pcie-mmio-identity=on,highmem-mmio-size=4T \ -cpu host -smp cpus=16 -m size=16G,slots=2,maxmem=256G -nographic \ -object memory-backend-ram,size=8G,id=m0 \ -object memory-backend-ram,size=8G,id=m1 \ -numa node,memdev=m0,cpus=0-15,nodeid=0 -numa node,memdev=m1,nodeid=1 \ -numa node,nodeid=2 -numa
node,nodeid=3 -numa node,nodeid=4 -numa node,nodeid=5\ -numa node,nodeid=6 -numa node,nodeid=7 -numa node,nodeid=8 -numa node,nodeid=9\ -device pxb-pcie,id=pcie.1,bus_nr=1,bus=pcie.0 \ -device arm-smmuv3,primary-bus=pcie.1,id=smmuv3.1,accel=on,ats=on,ril=off,pasid=on,oas=48,cmdqv=on \ -device pcie-root-port,id=pcie.port1,bus=pcie.1,chassis=1,io-reserve=0,acs-caps=0x1C \ -device x3130-upstream,id=upstream1,bus=pcie.port1 \ -device xio3130-downstream,id=downstream1_1,bus=upstream1,chassis=1,slot=1,acs-caps=0x19 \ -device vfio-pci,host=0018:03:00.0,bus=downstream1_1,id=dmapf1,iommufd=iommufd0 \ -device xio3130-downstream,id=downstream1_2,bus=upstream1,chassis=1,slot=2,acs-caps=0x15 \ -device vfio-pci-nohotplug,host=0018:06:00.0,bus=downstream1_2,rombar=0,id=dev0,iommufd=iommufd0 \ -object acpi-generic-initiator,id=gi0,pci-dev=dev0,node=2 \ -object acpi-generic-initiator,id=gi1,pci-dev=dev0,node=3 \ -object acpi-generic-initiator,id=gi2,pci-dev=dev0,node=4 \ -object acpi-generic-initiator,id=gi3,pci-dev=dev0,node=5 \ -object acpi-generic-initiator,id=gi4,pci-dev=dev0,node=6 \ -object acpi-generic-initiator,id=gi5,pci-dev=dev0,node=7 \ -object acpi-generic-initiator,id=gi6,pci-dev=dev0,node=8 \ -object acpi-generic-initiator,id=gi7,pci-dev=dev0,node=9 \ -bios /usr/share/AAVMF/AAVMF_CODE.fd \ -device nvme,drive=nvme0,serial=deadbeaf1,bus=pcie.0 \ -drive file=/home/nvidia/tushar/tushar/ubuntu-24.04-server-cloudimg-arm64-grace-6.14.0-1007-nvidia-64k.qcow2,index=0,media=disk,format=qcow2,if=none,id=nvme0 \ -device e1000,netdev=net0,bus=pcie.0 \ -device pxb-pcie,id=pcie.9,bus_nr=9,bus=pcie.0 \ -device arm-smmuv3,primary-bus=pcie.9,id=smmuv3.2,accel=on,ats=on,ril=off,pasid=on,oas=48,cmdqv=on \ -device pcie-root-port,id=pcie.port9,bus=pcie.9,chassis=4,io-reserve=0 \ -device x3130-upstream,id=upstream9,bus=pcie.port9 \ -device xio3130-downstream,id=downstream9_1,bus=upstream9,chassis=4,slot=1 \ -device vfio-pci,host=0012:03:00.1,bus=downstream9_1,id=nic1,iommufd=iommufd0 \ 
-netdev user,id=net0,hostfwd=tcp::5558-:22,hostfwd=tcp::5586-:5586 \ Signed-off-by: Tushar Dave --