diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 05266af3ea0..c99e0013772 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -49,6 +49,8 @@
 #include "hw/cxl/cxl.h"
 #include "hw/pci/pcie_host.h"
 #include "hw/pci/pci.h"
+#include "hw/vfio/pci.h"
+#include "hw/pci/pci_bridge.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci-host/gpex.h"
 #include "hw/arm/virt.h"
@@ -140,8 +142,595 @@ static void build_acpi0017(Aml *table)
     aml_append(table, scope);
 }
 
+typedef struct {
+    uint64_t addr;
+    uint64_t end;
+    uint64_t flags;
+} PhysBAR;
+
+typedef struct {
+    uint64_t wbase;
+    uint64_t wlimit;
+    uint64_t wbase64;
+    uint64_t wlimit64;
+    uint64_t rbase;
+    uint64_t rlimit;
+    uint64_t rsize;
+    uint64_t piobase;
+    bool available;
+    bool search_mmio64;
+    PCIDevice *dev;
+    PCIBus *bus;
+    struct GPEXConfig *cfg;
+    bool debug;
+} NVIDIACfg;
+
+#define IORESOURCE_PREFETCH 0x00002000 /* No side effects */
+#define IORESOURCE_MEM_64   0x00100000
+
+static void nvidia_get_bridge_window(PCIBus *bus, void *opaque)
+{
+    PCIDevice *bridge = pci_bridge_get_device(bus);
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+    struct GPEXConfig *cfg = ncfg->cfg;
+
+    if (!bridge) {
+        ncfg->wbase = cfg->mmio32.base;
+        ncfg->wlimit = cfg->mmio32.base + cfg->mmio32.size - 1;
+        ncfg->wbase64 = cfg->mmio64.base;
+        ncfg->wlimit64 = cfg->mmio64.base + cfg->mmio64.size - 1;
+    } else {
+        ncfg->wbase = pci_bridge_get_base(bridge, PCI_BASE_ADDRESS_MEM_TYPE_32);
+        ncfg->wlimit = pci_bridge_get_limit(bridge,
+                                            PCI_BASE_ADDRESS_MEM_TYPE_32);
+        ncfg->wbase64 = pci_bridge_get_base(bridge,
+                                            PCI_BASE_ADDRESS_MEM_PREFETCH);
+        ncfg->wlimit64 = pci_bridge_get_limit(bridge,
+                                              PCI_BASE_ADDRESS_MEM_PREFETCH);
+    }
+}
+
+static void nvidia_update_bridge_window(PCIBus *bus, uint64_t base,
+                                        uint64_t limit)
+{
+    PCIDevice *bridge = pci_bridge_get_device(bus);
+    uint32_t value0, value1;
+
+    assert(bridge);
+
+    value0 = (uint32_t)(extract64(base, 20, 12) << 4);
+    value1 = (uint32_t)(extract64(limit, 20, 12) << 4);
+    pci_host_config_write_common(bridge,
+                                 PCI_PREF_MEMORY_BASE,
+                                 pci_config_size(bridge),
+                                 value0 | PCI_PREF_RANGE_TYPE_64,
+                                 2);
+    pci_host_config_write_common(bridge,
+                                 PCI_PREF_BASE_UPPER32,
+                                 pci_config_size(bridge),
+                                 (uint32_t)(base >> 32),
+                                 4);
+    pci_host_config_write_common(bridge,
+                                 PCI_PREF_MEMORY_LIMIT,
+                                 pci_config_size(bridge),
+                                 value1 | PCI_PREF_RANGE_TYPE_64,
+                                 2);
+    pci_host_config_write_common(bridge,
+                                 PCI_PREF_LIMIT_UPPER32,
+                                 pci_config_size(bridge),
+                                 (uint32_t)(limit >> 32),
+                                 4);
+}
+
+static void fix_pci_bar_GB200_nvidia(PCIDevice *dev, PhysBAR *pbars)
+{
+    PhysBAR *pbar = pbars;
+    bool overlap;
+    int idx;
+
+    for (idx = PCI_ROM_SLOT - 1; idx >= 0; idx--) {
+        if (!(pbar[idx].flags & IORESOURCE_PREFETCH)) {
+            continue;
+        }
+
+        pbar[idx].addr = pbars[idx].addr;
+        pbar[idx].end = pbar[idx].addr + dev->io_regions[idx].size - 1;
+    }
+
+    /*
+     * Make sure BAR1 gets GPA == HPA; adjust the other two BARs
+     * accordingly to avoid region conflicts.
+     */
+    overlap = true;
+    while (overlap) {
+        overlap = false;
+
+        for (idx = 0; idx <= PCI_ROM_SLOT - 1; idx++) {
+            if (!(pbar[idx].flags & IORESOURCE_PREFETCH)) {
+                continue;
+            }
+
+            for (int j = 0; j < PCI_ROM_SLOT; j++) {
+                if (!(pbar[j].flags & IORESOURCE_PREFETCH) || idx == j) {
+                    continue;
+                }
+
+                if (ranges_overlap(pbar[idx].addr, dev->io_regions[idx].size,
+                                   pbar[j].addr, dev->io_regions[j].size)) {
+                    pbar[j].addr = QEMU_ALIGN_UP(pbar[idx].addr +
+                                                 dev->io_regions[idx].size,
+                                                 dev->io_regions[j].size);
+                    overlap = true;
+                }
+            }
+        }
+    }
+}
+
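+/*
+ * Read the physical BAR layout of a VFIO device from its sysfs 'resource'
+ * file and program the prefetchable 64-bit BARs with those host addresses,
+ * so the guest sees GPA == HPA.  Sets cfg->preserve_config once at least
+ * one BAR has been fixed up.
+ */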
+static void nvidia_dev_vfio(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+    struct GPEXConfig *cfg = (struct GPEXConfig *)opaque;
+    PhysBAR *pbar, pbars[PCI_ROM_SLOT];
+    char *tmp, *resources, line[128];
+    int idx, vendor_id, device_id;
+    VFIOPCIDevice *vdev;
+    uint32_t laddr;
+    FILE *fp;
+
+    if (!object_dynamic_cast(OBJECT(dev), TYPE_VFIO_PCI)) {
+        return;
+    }
+
+    vdev = VFIO_PCI_BASE(dev);
+
+    tmp = g_strdup_printf("%s/resource", vdev->vbasedev.sysfsdev);
+    resources = realpath(tmp, NULL);
+    g_free(tmp);
+    if (!resources) {
+        return;
+    }
+
+    idx = 0;
+    pbar = pbars;
+    memset(pbar, 0, sizeof(pbars));
+
+    fp = fopen(resources, "r");
+    free(resources);
+    if (!fp) {
+        return;
+    }
+
+    do {
+        if (!fgets(line, sizeof(line), fp)) {
+            fclose(fp);
+            return;
+        }
+        sscanf(line, "0x%" SCNx64 " 0x%" SCNx64 " 0x%" SCNx64 "\n",
+               &pbar->addr, &pbar->end, &pbar->flags);
+        idx++;
+        pbar++;
+    } while (*line && idx < PCI_ROM_SLOT);
+
+    fclose(fp);
+
+    vendor_id = pci_get_word(dev->config + PCI_VENDOR_ID);
+    device_id = pci_get_word(dev->config + PCI_DEVICE_ID);
+
+    /* NVIDIA GB200/GB300 workaround */
+    if (vendor_id == 0x10de &&
+        (device_id == 0x2941 || device_id == 0x31c2)) {
+        fix_pci_bar_GB200_nvidia(dev, pbars);
+    }
+
+    for (idx = 0, pbar = pbars; idx < PCI_ROM_SLOT; idx++, pbar++) {
+        if (!(pbar->flags & IORESOURCE_PREFETCH)) {
+            continue;
+        }
+        laddr = pbar->addr & PCI_BASE_ADDRESS_MEM_MASK;
+        laddr |= PCI_BASE_ADDRESS_MEM_PREFETCH | PCI_BASE_ADDRESS_MEM_TYPE_64;
+        vfio_pci_write_config(dev,
+                              PCI_BASE_ADDRESS_0 + (idx * 4),
+                              laddr,
+                              4);
+        vfio_pci_write_config(dev,
+                              PCI_BASE_ADDRESS_0 + (idx * 4) + 4,
+                              (uint32_t)(pbar->addr >> 32),
+                              4);
+        cfg->preserve_config = true;
+    }
+}
+
+static void nvidia_bus_vfio(PCIBus *bus, void *opaque)
+{
+    pci_for_each_device_under_bus(bus, nvidia_dev_vfio, opaque);
+}
+
+static void nvidia_mmio64_window(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+    uint64_t rbase, rlimit;
+    uint32_t idx;
+
+    for (idx = 0; idx < PCI_ROM_SLOT; idx++) {
+        PCIIORegion *res = &dev->io_regions[idx];
+
+        if (!res->size ||
+            res->addr < ncfg->wbase64 || res->addr > ncfg->wlimit64) {
+            continue;
+        }
+        rbase = res->addr;
+        rlimit = res->addr + res->size - 1;
+        ncfg->rbase = MIN(ncfg->rbase, rbase);
+        ncfg->rlimit = MAX(ncfg->rlimit, rlimit);
+    }
+
+    if (IS_PCI_BRIDGE(dev)) {
+        rbase = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_MEM_PREFETCH);
+        rlimit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_MEM_PREFETCH);
+
+        if (rbase < ncfg->wbase64 || rbase > ncfg->wlimit64 ||
+            rlimit < ncfg->wbase64 || rlimit > ncfg->wlimit64) {
+            return;
+        }
+
+        ncfg->rbase = MIN(ncfg->rbase, rbase);
+        ncfg->rlimit = MAX(ncfg->rlimit, rlimit);
+    }
+}
+
+static void nvidia_bus_update_bridge_window(PCIBus *bus, void *opaque)
+{
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+
+    ncfg->rbase = ~0;
+    ncfg->rlimit = 0;
+
+    assert(pci_bridge_get_device(bus));
+    pci_for_each_device_under_bus(bus, nvidia_mmio64_window, ncfg);
+
+    if (ncfg->rlimit > ncfg->rbase) {
+        nvidia_update_bridge_window(bus, ncfg->rbase, ncfg->rlimit);
+    }
+}
+
+static void nvidia_dev_rom_max_size(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+    uint64_t base, size;
+
+    size = dev->io_regions[PCI_ROM_SLOT].size;
+    if (!size) {
+        return;
+    }
+
+    base = pci_host_config_read_common(dev,
+                                       PCI_ROM_ADDRESS,
+                                       pci_config_size(dev),
+                                       4);
+    base &= ~(size - 1);
+    if (base >= ncfg->wbase && (base + size - 1) <= ncfg->wlimit) {
+        return;
+    }
+
+    /* Track the largest ROM BAR that still sits outside the window */
+    if (size > ncfg->rsize) {
+        ncfg->rsize = size;
+        ncfg->dev = dev;
+    }
+}
+
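+/*
+ * Per-device helper for nvidia_find_mmio(): if the candidate range
+ * [rbase, rbase + rsize) collides with a BAR, expansion ROM or bridge
+ * window already placed inside the search window, move rbase past the
+ * conflicting range and clear 'available' so the scan is repeated.
+ */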
+static void nvidia_find_mmio_helper(PCIBus *bus, PCIDevice *dev,
+                                    void *opaque)
+{
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+    uint64_t base, limit, wbase, wlimit;
+    uint32_t idx;
+    PCIIORegion *res;
+
+    if (ncfg->search_mmio64) {
+        wbase = ncfg->wbase64;
+        wlimit = ncfg->wlimit64;
+    } else {
+        wbase = ncfg->wbase;
+        wlimit = ncfg->wlimit;
+    }
+
+    for (idx = 0; idx < PCI_NUM_REGIONS; idx++) {
+        res = &dev->io_regions[idx];
+        if (!res->size || (res->type & PCI_BASE_ADDRESS_SPACE_IO)) {
+            continue;
+        }
+
+        if (ncfg->search_mmio64) {
+            if (!(res->type & PCI_BASE_ADDRESS_MEM_TYPE_64) ||
+                !(res->type & PCI_BASE_ADDRESS_MEM_PREFETCH)) {
+                continue;
+            }
+        }
+
+        if (idx == PCI_ROM_SLOT) {
+            base = pci_host_config_read_common(dev,
+                                               PCI_ROM_ADDRESS,
+                                               pci_config_size(dev),
+                                               4);
+        } else {
+            base = res->addr;
+        }
+
+        base &= ~(res->size - 1);
+        if (base < wbase || (base + res->size - 1) > wlimit) {
+            continue;
+        }
+
+        if (ranges_overlap(ncfg->rbase, ncfg->rsize, base, res->size)) {
+            ncfg->rbase = QEMU_ALIGN_UP(base + res->size, ncfg->rsize);
+            ncfg->rlimit = ncfg->rbase + ncfg->rsize - 1;
+            ncfg->available = false;
+        }
+    }
+
+    if (IS_PCI_BRIDGE(dev)) {
+        if (ncfg->search_mmio64) {
+            base = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_MEM_PREFETCH);
+            limit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_MEM_PREFETCH);
+        } else {
+            base = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_MEM_TYPE_32);
+            limit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_MEM_TYPE_32);
+        }
+
+        if (base < wbase || limit > wlimit) {
+            return;
+        }
+
+        if (ranges_overlap(ncfg->rbase, ncfg->rsize, base, limit - base + 1)) {
+            ncfg->rbase = QEMU_ALIGN_UP(limit + 1, ncfg->rsize);
+            ncfg->rlimit = ncfg->rbase + ncfg->rsize - 1;
+            ncfg->available = false;
+        }
+    }
+}
+
+static bool nvidia_find_mmio(PCIBus *bus, NVIDIACfg *ncfg)
+{
+    uint64_t wlimit;
+
+    if (ncfg->search_mmio64) {
+        ncfg->rbase = ncfg->wbase64;
+        wlimit = ncfg->wlimit64;
+    } else {
+        ncfg->rbase = ncfg->wbase;
+        wlimit = ncfg->wlimit;
+    }
+    ncfg->rlimit = ncfg->rbase + ncfg->rsize - 1;
+
+    while (ncfg->rlimit <= wlimit) {
+        ncfg->available = true;
+        pci_for_each_device_under_bus(bus, nvidia_find_mmio_helper, ncfg);
+        if (ncfg->available) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static void nvidia_bus_adjust_mmio32_rom(PCIBus *bus, void *opaque)
+{
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+
+    ncfg->search_mmio64 = false;
+    nvidia_get_bridge_window(bus, ncfg);
+
+    do {
+        ncfg->rsize = 0;
+        pci_for_each_device_under_bus(bus, nvidia_dev_rom_max_size, ncfg);
+        if (!ncfg->rsize) {
+            break;
+        }
+        if (nvidia_find_mmio(bus, ncfg)) {
+            pci_host_config_write_common(ncfg->dev,
+                                         PCI_ROM_ADDRESS,
+                                         pci_config_size(ncfg->dev),
+                                         ncfg->rbase,
+                                         4);
+        }
+    } while (true);
+}
+
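+/*
+ * Shift a device's prefetchable 64-bit BARs that ended up outside the
+ * bridge window by ncfg->rbase, which the caller has set to the delta
+ * between the old and the new window base.
+ */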
+static void nvidia_dev_shift_mmio64(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+    uint64_t addr;
+    uint32_t idx;
+
+    for (idx = 0; idx < PCI_ROM_SLOT; idx++) {
+        PCIIORegion *res = &dev->io_regions[idx];
+
+        if (!res->size ||
+            !(res->type & PCI_BASE_ADDRESS_MEM_TYPE_64) ||
+            !(res->type & PCI_BASE_ADDRESS_MEM_PREFETCH)) {
+            continue;
+        }
+
+        addr = res->addr & PCI_BASE_ADDRESS_MEM_MASK;
+        if (addr >= ncfg->wbase64 && addr <= ncfg->wlimit64) {
+            continue;
+        }
+
+        addr += ncfg->rbase;
+        addr |= PCI_BASE_ADDRESS_MEM_PREFETCH | PCI_BASE_ADDRESS_MEM_TYPE_64;
+
+        pci_host_config_write_common(dev,
+                                     PCI_BASE_ADDRESS_0 + (idx * 4),
+                                     pci_config_size(dev),
+                                     (uint32_t)(addr & 0xffffffff),
+                                     4);
+        pci_host_config_write_common(dev,
+                                     PCI_BASE_ADDRESS_0 + (idx * 4) + 4,
+                                     pci_config_size(dev),
+                                     (uint32_t)(addr >> 32),
+                                     4);
+    }
+}
+
+static void nvidia_dev_unassigned_mmio64(PCIBus *bus, PCIDevice *dev,
+                                         void *opaque)
+{
+    NVIDIACfg *ncfg0 = (NVIDIACfg *)opaque;
+    struct GPEXConfig *cfg = ncfg0->cfg;
+    NVIDIACfg ncfg1, *ncfg = &ncfg1;
+    uint64_t base, limit;
+    PCIBus *sbus;
+
+    if (!IS_PCI_BRIDGE(dev)) {
+        return;
+    }
+
+    sbus = &PCI_BRIDGE(dev)->sec_bus;
+    memcpy(ncfg, ncfg0, sizeof(NVIDIACfg));
+    base = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_MEM_PREFETCH);
+    limit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_MEM_PREFETCH);
+
+    if (base >= ncfg->wbase64 && base <= ncfg->wlimit64 &&
+        limit >= ncfg->wbase64 && limit <= ncfg->wlimit64) {
+        return;
+    }
+
+    ncfg->rsize = base >= limit ? 0x100000 : limit - base + 1;
+    ncfg->search_mmio64 = true;
+    nvidia_get_bridge_window(bus, ncfg);
+
+    /* Check if the required space is free in the parent bus */
+    if (!nvidia_find_mmio(bus, ncfg)) {
+        /* Try with the extended parent window */
+        ncfg->rbase = QEMU_ALIGN_UP(ncfg->wlimit64 + 1, ncfg->rsize);
+        ncfg->wlimit64 = ncfg->rbase + ncfg->rsize - 1;
+        /* TODO: check conflicts with the extended window */
+    }
+
+    if (base >= limit) {
+        nvidia_update_bridge_window(sbus, ncfg->rbase, ncfg->rlimit);
+    } else {
+        ncfg->rbase -= base;
+        pci_for_each_device_under_bus(sbus, nvidia_dev_shift_mmio64, ncfg);
+    }
+
+    ncfg->wbase64 = cfg->mmio64.base + cfg->mmio64.size / 2;
+    ncfg->wlimit64 = ncfg->wbase64 + (cfg->mmio64.size / 2) - 1;
+    pci_for_each_bus(ncfg->bus, nvidia_bus_update_bridge_window, ncfg);
+}
+
+static void nvidia_bus_unassigned_mmio64(PCIBus *bus, void *opaque)
+{
+    pci_for_each_device_under_bus(bus, nvidia_dev_unassigned_mmio64, opaque);
+}
+
+static void nvidia_dev_assign_pio(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+    struct GPEXConfig *cfg = ncfg->cfg;
+    PCIIORegion *res;
+    uint32_t idx;
+
+    for (idx = 0; idx < PCI_NUM_REGIONS; idx++) {
+        res = &dev->io_regions[idx];
+
+        if (!res->size || !(res->type & PCI_BASE_ADDRESS_SPACE_IO)) {
+            continue;
+        }
+        ncfg->piobase = QEMU_ALIGN_UP(ncfg->piobase, res->size);
+        pci_host_config_write_common(dev,
+                                     PCI_BASE_ADDRESS_0 + (idx * 4),
+                                     pci_config_size(dev),
+                                     (uint32_t)(ncfg->piobase - cfg->pio.base),
+                                     4);
+        ncfg->piobase += res->size;
+    }
+}
+
+static void nvidia_pio_window(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+    uint64_t rbase, rlimit;
+    uint32_t idx;
+
+    for (idx = 0; idx < PCI_ROM_SLOT; idx++) {
+        PCIIORegion *res = &dev->io_regions[idx];
+
+        if (!res->size || !(res->type & PCI_BASE_ADDRESS_SPACE_IO)) {
+            continue;
+        }
+
+        rbase = res->addr;
+        rlimit = res->addr + res->size - 1;
+        ncfg->rbase = MIN(ncfg->rbase, rbase);
+        ncfg->rlimit = MAX(ncfg->rlimit, rlimit);
+    }
+
+    if (IS_PCI_BRIDGE(dev)) {
+        rbase = pci_bridge_get_base(dev, PCI_BASE_ADDRESS_SPACE_IO);
+        rlimit = pci_bridge_get_limit(dev, PCI_BASE_ADDRESS_SPACE_IO);
+
+        ncfg->rbase = MIN(ncfg->rbase, rbase);
+        ncfg->rlimit = MAX(ncfg->rlimit, rlimit);
+    }
+}
+
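+/*
+ * Assign the I/O port BARs of all devices on this bus from the global PIO
+ * cursor, then program the bridge's I/O base/limit registers to cover the
+ * assigned range.
+ */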
+static void nvidia_bus_assign_pio(PCIBus *bus, void *opaque)
+{
+    PCIDevice *bridge = pci_bridge_get_device(bus);
+    NVIDIACfg *ncfg = (NVIDIACfg *)opaque;
+    uint32_t value0, value1;
+
+    ncfg->piobase = QEMU_ALIGN_UP(ncfg->piobase, 0x1000);
+    pci_for_each_device_under_bus(bus, nvidia_dev_assign_pio, ncfg);
+    if (!bridge) {
+        return;
+    }
+
+    ncfg->rbase = ~0;
+    ncfg->rlimit = 0;
+    pci_for_each_device_under_bus(bus, nvidia_pio_window, ncfg);
+
+    if (ncfg->rbase > ncfg->rlimit) {
+        ncfg->rbase = QEMU_ALIGN_UP(ncfg->piobase, 0x1000);
+        ncfg->piobase += 0x1000;
+        ncfg->rlimit = ncfg->piobase - 1;
+    }
+
+    value0 = (uint32_t)(extract64(ncfg->rbase, 12, 4) << 4);
+    value1 = (uint32_t)(extract64(ncfg->rlimit, 12, 4) << 4);
+
+    pci_host_config_write_common(bridge,
+                                 PCI_IO_BASE,
+                                 pci_config_size(bridge),
+                                 value0 | PCI_IO_RANGE_TYPE_16,
+                                 1);
+    pci_host_config_write_common(bridge,
+                                 PCI_IO_LIMIT,
+                                 pci_config_size(bridge),
+                                 value1 | PCI_IO_RANGE_TYPE_16,
+                                 1);
+}
+
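+/*
+ * Entry point of the GPA == HPA workaround: identity-map the prefetchable
+ * 64-bit BARs of VFIO devices, then reassign expansion ROMs, I/O BARs and
+ * bridge windows so that nothing conflicts with the fixed BARs.  A no-op
+ * unless at least one VFIO device requested config preservation.
+ */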
+static void nvidia_prepare_mmio64_identity(struct GPEXConfig *cfg)
+{
+    NVIDIACfg ncfg1, *ncfg = &ncfg1;
+    PCIBus *bus = cfg->bus;
+
+    pci_for_each_bus(bus, nvidia_bus_vfio, cfg);
+    if (!cfg->preserve_config) {
+        return;
+    }
+
+    memset(ncfg, 0, sizeof(NVIDIACfg));
+    ncfg->cfg = cfg;
+
+    nvidia_get_bridge_window(bus, ncfg);
+    pci_for_each_bus(bus, nvidia_bus_adjust_mmio32_rom, ncfg);
+
+    ncfg->piobase = cfg->pio.base;
+    pci_for_each_bus(bus, nvidia_bus_assign_pio, ncfg);
+
+    nvidia_get_bridge_window(bus, ncfg);
+
+    QLIST_FOREACH(bus, &bus->child, sibling) {
+        ncfg->bus = bus;
+        ncfg->wbase64 = cfg->mmio64.base + cfg->mmio64.size / 2;
+        ncfg->wlimit64 = ncfg->wbase64 + (cfg->mmio64.size / 2) - 1;
+
+        pci_for_each_bus(bus, nvidia_bus_update_bridge_window, ncfg);
+        pci_for_each_bus(bus, nvidia_bus_unassigned_mmio64, ncfg);
+    }
+}
+
 static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap,
-                              uint32_t irq, VirtMachineState *vms)
+                              uint32_t irq, VirtMachineState *vms, bool update)
 {
     int ecam_id = VIRT_ECAM_ID(vms->highmem_ecam);
     bool cxl_present = false;
@@ -173,6 +762,10 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap,
 
     if (vms->highmem_mmio) {
         cfg.mmio64 = memmap[VIRT_HIGH_PCIE_MMIO];
+
+        if (vms->grace_pcie_mmio_identity && update) {
+            nvidia_prepare_mmio64_identity(&cfg);
+        }
     }
 
     acpi_dsdt_add_gpex(scope, &cfg);
@@ -1143,7 +1736,7 @@ static int acpi_dsdt_add_cmdqv(Aml *scope, GArray *smmuv3_devs)
 
 /* DSDT */
 static void
-build_dsdt(GArray *table_data, AcpiBuildTables *tables, VirtMachineState *vms)
+build_dsdt(GArray *table_data, AcpiBuildTables *tables, VirtMachineState *vms,
+           bool update)
 {
     VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms);
     GArray *smmuv3_devs = tables->smmuv3_devs;
@@ -1179,7 +1772,7 @@ build_dsdt(GArray *table_data, AcpiBuildTables *tables, VirtMachineState *vms)
     virtio_acpi_dsdt_add(scope, memmap[VIRT_MMIO].base, memmap[VIRT_MMIO].size,
                          (irqmap[VIRT_MMIO] + ARM_SPI_BASE),
                          0, NUM_VIRTIO_TRANSPORTS);
-    acpi_dsdt_add_pci(scope, memmap, irqmap[VIRT_PCIE] + ARM_SPI_BASE, vms);
+    acpi_dsdt_add_pci(scope, memmap, irqmap[VIRT_PCIE] + ARM_SPI_BASE, vms,
+                      update);
 
     if (vms->acpi_dev) {
         build_ged_aml(scope, "\\_SB."GED_DEVICE, HOTPLUG_HANDLER(vms->acpi_dev),
@@ -1274,7 +1867,7 @@ static void virt_acpi_prebuild(VirtMachineState *vms, AcpiBuildTables *tables)
 }
 
 static
-void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
+void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables,
+                     bool update)
 {
     VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms);
     GArray *table_offsets;
@@ -1292,7 +1885,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
 
     /* DSDT is pointed to by FADT */
     dsdt = tables_blob->len;
-    build_dsdt(tables_blob, tables, vms);
+    build_dsdt(tables_blob, tables, vms, update);
 
     /* FADT MADT PPTT GTDT MCFG SPCR DBG2 pointed to by RSDT */
     acpi_add_table(table_offsets, tables_blob);
@@ -1438,7 +2031,7 @@ static void virt_acpi_build_update(void *build_opaque)
 
     acpi_build_tables_init(&tables);
 
-    virt_acpi_build(VIRT_MACHINE(qdev_get_machine()), &tables);
+    virt_acpi_build(VIRT_MACHINE(qdev_get_machine()), &tables, true);
 
     acpi_ram_update(build_state->table_mr, tables.table_data);
     acpi_ram_update(build_state->rsdp_mr, tables.rsdp);
@@ -1482,7 +2075,7 @@ void virt_acpi_setup(VirtMachineState *vms)
     build_state = g_malloc0(sizeof *build_state);
 
     acpi_build_tables_init(&tables);
-    virt_acpi_build(vms, &tables);
+    virt_acpi_build(vms, &tables, false);
 
     /* Now expose it all to Guest */
     build_state->table_mr = acpi_add_rom_blob(virt_acpi_build_update,
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 418ed77debf..8bced5ea60c 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1693,6 +1693,9 @@ static void create_pcie(VirtMachineState *vms)
                                  2, base_ecam, 2, size_ecam);
 
     if (vms->highmem_mmio) {
+        if (vms->grace_pcie_mmio_identity && virt_is_acpi_enabled(vms)) {
+            size_mmio_high = size_mmio_high >> 1;
+        }
         qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "ranges",
                                      1, FDT_PCI_RANGE_IOPORT, 2, 0,
                                      2, base_pio, 2, size_pio,
@@ -1955,6 +1958,12 @@ static void virt_set_high_memmap(VirtMachineState *vms,
             vms->highest_gpa = base - 1;
         }
     }
+
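+    /*
+     * Fix the high PCIe MMIO window at GPA 0x400000000000 (64 TiB) with a
+     * 64 TiB size, so that host BAR addresses can be identity-mapped into
+     * the guest, and extend highest_gpa to the top of the PA space.
+     */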
"The string may be up to 8 bytes in size"); + object_class_property_add_bool(oc, "grace-pcie-mmio-identity", + virt_get_grace_pcie_mmio_identity, + virt_set_grace_pcie_mmio_identity); + object_class_property_set_description(oc, "grace-pcie-mmio-identity", + "Set on/off to enable/disable " + "mapping PCIe 64bit BARs with " + "HPA = IPA for pass-through devices"); + } static void virt_instance_init(Object *obj) diff --git a/hw/pci-bridge/pcie_root_port.c b/hw/pci-bridge/pcie_root_port.c index 22c2fdb71e7..acc3f8d89a1 100644 --- a/hw/pci-bridge/pcie_root_port.c +++ b/hw/pci-bridge/pcie_root_port.c @@ -46,6 +46,7 @@ static void rp_write_config(PCIDevice *d, uint32_t address, static void rp_reset_hold(Object *obj, ResetType type) { PCIDevice *d = PCI_DEVICE(obj); + PCIEPort *p = PCIE_PORT(d); DeviceState *qdev = DEVICE(obj); rp_aer_vector_update(d); @@ -53,7 +54,7 @@ static void rp_reset_hold(Object *obj, ResetType type) pcie_cap_deverr_reset(d); pcie_cap_slot_reset(d); pcie_cap_arifwd_reset(d); - pcie_acs_reset(d); + pcie_acs_reset(d, p->acs_caps); pcie_aer_root_reset(d); pci_bridge_reset(qdev); pci_bridge_disable_base_limit(d); @@ -118,7 +119,10 @@ static void rp_realize(PCIDevice *d, Error **errp) rp_aer_vector_update(d); if (rpc->acs_offset && !s->disable_acs) { - pcie_acs_init(d, rpc->acs_offset); + rc = pcie_acs_init(d, rpc->acs_offset, p->acs_caps, errp); + if (rc < 0) { + goto err; + } } return; @@ -152,6 +156,8 @@ static const Property rp_props[] = { DEFINE_PROP_BIT(COMPAT_PROP_PCP, PCIDevice, cap_present, QEMU_PCIE_SLTCAP_PCP_BITNR, true), DEFINE_PROP_BOOL("disable-acs", PCIESlot, disable_acs, false), + DEFINE_PROP_UINT16("acs-caps", PCIEPort, + acs_caps, 0), }; static void rp_instance_post_init(Object *obj) diff --git a/hw/pci-bridge/xio3130_downstream.c b/hw/pci-bridge/xio3130_downstream.c index dc7d1aa7d77..9674c06c0b7 100644 --- a/hw/pci-bridge/xio3130_downstream.c +++ b/hw/pci-bridge/xio3130_downstream.c @@ -40,6 +40,8 @@ #define XIO3130_SSVID_SSID 0 #define XIO3130_EXP_OFFSET 0x90 #define XIO3130_AER_OFFSET 0x100 +#define XIO3130_ACS_OFFSET \ + (XIO3130_AER_OFFSET + PCI_ERR_SIZEOF) static void xio3130_downstream_write_config(PCIDevice *d, uint32_t address, uint32_t val, int len) @@ -56,10 +58,12 @@ static void xio3130_downstream_write_config(PCIDevice *d, uint32_t address, static void xio3130_downstream_reset(DeviceState *qdev) { PCIDevice *d = PCI_DEVICE(qdev); + PCIEPort *p = PCIE_PORT(d); pcie_cap_deverr_reset(d); pcie_cap_slot_reset(d); pcie_cap_arifwd_reset(d); + pcie_acs_reset(d, p->acs_caps); pci_bridge_reset(qdev); } @@ -111,6 +115,11 @@ static void xio3130_downstream_realize(PCIDevice *d, Error **errp) goto err; } + rc = pcie_acs_init(d, XIO3130_ACS_OFFSET, p->acs_caps, errp); + if (rc < 0) { + goto err; + } + return; err: @@ -137,6 +146,7 @@ static void xio3130_downstream_exitfn(PCIDevice *d) static const Property xio3130_downstream_props[] = { DEFINE_PROP_BIT(COMPAT_PROP_PCP, PCIDevice, cap_present, QEMU_PCIE_SLTCAP_PCP_BITNR, true), + DEFINE_PROP_UINT16("acs-caps", PCIEPort, acs_caps, 0), }; static const VMStateDescription vmstate_xio3130_downstream = { diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index eaeb68894e6..738eb2e24d4 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1178,10 +1178,11 @@ void pcie_ats_init(PCIDevice *dev, uint16_t offset, bool aligned) } /* ACS (Access Control Services) */ -void pcie_acs_init(PCIDevice *dev, uint16_t offset) +int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits, Error **errp) { bool is_downstream = 
-void pcie_acs_init(PCIDevice *dev, uint16_t offset)
+int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits,
+                  Error **errp)
 {
     bool is_downstream = pci_is_express_downstream_port(dev);
     uint16_t cap_bits = 0;
+    /* Only downstream ports are PCIEPort objects; avoid a bad QOM cast */
+    PCIEPort *p = is_downstream ? PCIE_PORT(dev) : NULL;
 
     /* For endpoints, only multifunction devs may have an ACS capability: */
     assert(is_downstream ||
@@ -1202,16 +1203,33 @@ void pcie_acs_init(PCIDevice *dev, uint16_t offset)
          */
         cap_bits = PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR |
             PCI_ACS_CR | PCI_ACS_UF | PCI_ACS_DT;
+
+        if (ctrl_bits & ~cap_bits) {
+            error_setg(errp, "Unsupported ACS capabilities 0x%hx were "
+                       "supplied. Supported capabilities are 0x%hx",
+                       ctrl_bits & ~cap_bits, cap_bits);
+            return -EINVAL;
+        }
     }
 
     pci_set_word(dev->config + offset + PCI_ACS_CAP, cap_bits);
-    pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, cap_bits);
+
+    if (is_downstream && p->acs_caps) {
+        /*
+         * Block guest writes to ACS Control entirely to preserve the ACS
+         * settings configured in QEMU.
+         */
+        pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, 0);
+    } else {
+        pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, cap_bits);
+    }
+
+    pci_set_word(dev->config + offset + PCI_ACS_CTRL, ctrl_bits);
+
+    return 0;
 }
 
-void pcie_acs_reset(PCIDevice *dev)
+void pcie_acs_reset(PCIDevice *dev, uint16_t val)
 {
     if (dev->exp.acs_cap) {
-        pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0);
+        pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, val);
     }
 }
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 0963356fc26..6f375e1b018 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -182,6 +182,7 @@ struct VirtMachineState {
     CXLState cxl_devices_state;
     bool legacy_smmuv3_present;
     bool pci_preserve_config;
+    bool grace_pcie_mmio_identity;
 };
 
 #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index ff6ce08e135..fde4942feb8 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -135,8 +135,8 @@ void pcie_add_capability(PCIDevice *dev,
                          uint16_t offset, uint16_t size);
 void pcie_sync_bridge_lnk(PCIDevice *dev);
 
-void pcie_acs_init(PCIDevice *dev, uint16_t offset);
-void pcie_acs_reset(PCIDevice *dev);
+int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits,
+                  Error **errp);
+void pcie_acs_reset(PCIDevice *dev, uint16_t val);
 
 void pcie_ari_init(PCIDevice *dev, uint16_t offset);
 void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t ser_num);
diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h
index 7cd7af8cfa4..4cb40f3226e 100644
--- a/include/hw/pci/pcie_port.h
+++ b/include/hw/pci/pcie_port.h
@@ -36,6 +36,7 @@ struct PCIEPort {
 
     /* pci express switch port */
     uint8_t port;
+    uint16_t acs_caps;
 };
 
 void pcie_port_init_reg(PCIDevice *d);