Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
607 changes: 600 additions & 7 deletions hw/arm/virt-acpi-build.c

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions hw/arm/virt.c
Original file line number Diff line number Diff line change
Expand Up @@ -1693,6 +1693,9 @@ static void create_pcie(VirtMachineState *vms)
2, base_ecam, 2, size_ecam);

if (vms->highmem_mmio) {
if (vms->grace_pcie_mmio_identity && virt_is_acpi_enabled(vms)) {
size_mmio_high = size_mmio_high >> 1;
}
qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "ranges",
1, FDT_PCI_RANGE_IOPORT, 2, 0,
2, base_pio, 2, size_pio,
Expand Down Expand Up @@ -1955,6 +1958,12 @@ static void virt_set_high_memmap(VirtMachineState *vms,
vms->highest_gpa = base - 1;
}
}

if (vms->grace_pcie_mmio_identity) {
vms->highest_gpa = BIT_ULL(pa_bits) - 1;
vms->memmap[VIRT_HIGH_PCIE_MMIO].base = 0x400000000000;
vms->memmap[VIRT_HIGH_PCIE_MMIO].size = 0x400000000000;
}
}

static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
Expand Down Expand Up @@ -2793,6 +2802,20 @@ static void virt_set_oem_table_id(Object *obj, const char *value,
strncpy(vms->oem_table_id, value, 8);
}

/*
 * Getter registered for the "grace-pcie-mmio-identity" machine property.
 *
 * @obj:  object that is cast to VirtMachineState via VIRT_MACHINE()
 * @errp: unused by this getter
 *
 * Returns the current value of the grace_pcie_mmio_identity flag.
 */
static bool virt_get_grace_pcie_mmio_identity(Object *obj, Error **errp)
{
    return VIRT_MACHINE(obj)->grace_pcie_mmio_identity;
}

/*
 * Setter registered for the "grace-pcie-mmio-identity" machine property.
 *
 * @obj:   object that is cast to VirtMachineState via VIRT_MACHINE()
 * @value: new state of the flag
 * @errp:  unused by this setter
 *
 * Only records the flag in the machine state; no other action is taken here.
 */
static void virt_set_grace_pcie_mmio_identity(Object *obj, bool value,
                                              Error **errp)
{
    VirtMachineState *machine = VIRT_MACHINE(obj);

    machine->grace_pcie_mmio_identity = value;
}

bool virt_is_acpi_enabled(VirtMachineState *vms)
{
Expand Down Expand Up @@ -3498,6 +3521,14 @@ static void virt_machine_class_init(ObjectClass *oc, const void *data)
"in ACPI table header."
"The string may be up to 8 bytes in size");

object_class_property_add_bool(oc, "grace-pcie-mmio-identity",
virt_get_grace_pcie_mmio_identity,
virt_set_grace_pcie_mmio_identity);
object_class_property_set_description(oc, "grace-pcie-mmio-identity",
"Set on/off to enable/disable "
"mapping PCIe 64bit BARs with "
"HPA = IPA for pass-through devices");

}

static void virt_instance_init(Object *obj)
Expand Down
10 changes: 8 additions & 2 deletions hw/pci-bridge/pcie_root_port.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,15 @@ static void rp_write_config(PCIDevice *d, uint32_t address,
static void rp_reset_hold(Object *obj, ResetType type)
{
PCIDevice *d = PCI_DEVICE(obj);
PCIEPort *p = PCIE_PORT(d);
DeviceState *qdev = DEVICE(obj);

rp_aer_vector_update(d);
pcie_cap_root_reset(d);
pcie_cap_deverr_reset(d);
pcie_cap_slot_reset(d);
pcie_cap_arifwd_reset(d);
pcie_acs_reset(d);
pcie_acs_reset(d, p->acs_caps);
pcie_aer_root_reset(d);
pci_bridge_reset(qdev);
pci_bridge_disable_base_limit(d);
Expand Down Expand Up @@ -118,7 +119,10 @@ static void rp_realize(PCIDevice *d, Error **errp)
rp_aer_vector_update(d);

if (rpc->acs_offset && !s->disable_acs) {
pcie_acs_init(d, rpc->acs_offset);
rc = pcie_acs_init(d, rpc->acs_offset, p->acs_caps, errp);
if (rc < 0) {
goto err;
}
}
return;

Expand Down Expand Up @@ -152,6 +156,8 @@ static const Property rp_props[] = {
DEFINE_PROP_BIT(COMPAT_PROP_PCP, PCIDevice, cap_present,
QEMU_PCIE_SLTCAP_PCP_BITNR, true),
DEFINE_PROP_BOOL("disable-acs", PCIESlot, disable_acs, false),
DEFINE_PROP_UINT16("acs-caps", PCIEPort,
acs_caps, 0),
};

static void rp_instance_post_init(Object *obj)
Expand Down
10 changes: 10 additions & 0 deletions hw/pci-bridge/xio3130_downstream.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
#define XIO3130_SSVID_SSID 0
#define XIO3130_EXP_OFFSET 0x90
#define XIO3130_AER_OFFSET 0x100
/*
 * Config-space offset handed to pcie_acs_init() for this downstream port.
 * Computed as the AER offset plus PCI_ERR_SIZEOF (presumably the size of the
 * AER capability, so that ACS follows it directly -- TODO confirm).
 */
#define XIO3130_ACS_OFFSET \
(XIO3130_AER_OFFSET + PCI_ERR_SIZEOF)

static void xio3130_downstream_write_config(PCIDevice *d, uint32_t address,
uint32_t val, int len)
Expand All @@ -56,10 +58,12 @@ static void xio3130_downstream_write_config(PCIDevice *d, uint32_t address,
/*
 * Reset handler for the XIO3130 downstream port.
 *
 * Resets the PCIe capability state (device error, slot, ARI forwarding),
 * re-applies the port's configured ACS value and finally resets the
 * bridge itself.
 *
 * @qdev: the device being reset
 */
static void xio3130_downstream_reset(DeviceState *qdev)
{
    PCIDevice *pci_dev = PCI_DEVICE(qdev);
    PCIEPort *port = PCIE_PORT(pci_dev);

    pcie_cap_deverr_reset(pci_dev);
    pcie_cap_slot_reset(pci_dev);
    pcie_cap_arifwd_reset(pci_dev);

    /* Hand the port's acs_caps to the ACS reset helper as the reset value. */
    pcie_acs_reset(pci_dev, port->acs_caps);

    pci_bridge_reset(qdev);
}

Expand Down Expand Up @@ -111,6 +115,11 @@ static void xio3130_downstream_realize(PCIDevice *d, Error **errp)
goto err;
}

rc = pcie_acs_init(d, XIO3130_ACS_OFFSET, p->acs_caps, errp);
if (rc < 0) {
goto err;
}

return;

err:
Expand All @@ -137,6 +146,7 @@ static void xio3130_downstream_exitfn(PCIDevice *d)
/* User-settable properties of the XIO3130 downstream port. */
static const Property xio3130_downstream_props[] = {
/* Bit in PCIDevice.cap_present selected by QEMU_PCIE_SLTCAP_PCP_BITNR; on by default. */
DEFINE_PROP_BIT(COMPAT_PROP_PCP, PCIDevice, cap_present,
QEMU_PCIE_SLTCAP_PCP_BITNR, true),
/*
 * "acs-caps": 16-bit value stored in PCIEPort.acs_caps (default 0); the
 * realize and reset paths pass it to pcie_acs_init()/pcie_acs_reset().
 */
DEFINE_PROP_UINT16("acs-caps", PCIEPort, acs_caps, 0),
};

static const VMStateDescription vmstate_xio3130_downstream = {
Expand Down
26 changes: 22 additions & 4 deletions hw/pci/pcie.c
Original file line number Diff line number Diff line change
Expand Up @@ -1178,10 +1178,11 @@ void pcie_ats_init(PCIDevice *dev, uint16_t offset, bool aligned)
}

/* ACS (Access Control Services) */
void pcie_acs_init(PCIDevice *dev, uint16_t offset)
int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits, Error **errp)
{
bool is_downstream = pci_is_express_downstream_port(dev);
uint16_t cap_bits = 0;
PCIEPort *p = PCIE_PORT(dev);

/* For endpoints, only multifunction devs may have an ACS capability: */
assert(is_downstream ||
Expand All @@ -1202,16 +1203,33 @@ void pcie_acs_init(PCIDevice *dev, uint16_t offset)
*/
cap_bits = PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR |
PCI_ACS_CR | PCI_ACS_UF | PCI_ACS_DT;

if (ctrl_bits & ~cap_bits) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

capability check LGTM

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for your thorough review :)

error_setg(errp, "Unsupported ACS capabilities 0x%hx were supplied. "
"Supported capabilities are 0x%hx", ctrl_bits & ~cap_bits,
cap_bits);
return -EINVAL;
}
}

pci_set_word(dev->config + offset + PCI_ACS_CAP, cap_bits);
pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, cap_bits);

if (is_downstream && p->acs_caps) {
/* Block guest writes to ACS Control entirely to preserve QEMU ACS settings */

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mainly for my understanding: If this is only needed for GPUDirect RDMA, is it necessary to make this change for all PCI devices (as opposed to just NICs and GPUs?)

I'd also like to have a better understanding of whether you think this kind of change might cause any unexpected end-user-visible behavior when doing passthrough operations, which might manifest differently than if they were doing the same operation on a different platform with mainline qemu. (If so, can you link to where this divergent behavior is documented somewhere in your end-user instructions?)

Also, making the ACS read-only to the guest seems like something that might be useful to other hardware, and also might be nice to have as an explicit qemu configuration option rather than being implemented in this function - are there plans to make this a more general configurable in that future upstream PR?

Copy link
Author

@tdavenvidia tdavenvidia Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mainly for my understanding: If this is only needed for GPUDirect RDMA, is it necessary to make this change for all PCI devices (as opposed to just NICs and GPUs?)

We are adding 'acs_caps' as a generic option, right? ACS is a per-device feature, primarily applicable to Root Ports and Downstream Ports, and so is 'acs_caps'.

I'd also like to have a better understanding of whether you think this kind of change might cause any unexpected end-user-visible behavior when doing passthrough operations,

An admin who is launching QEMU and using ACS should know what they are doing. Like the kernel, which allows the use of the 'config_acs' kernel parameter, acs_caps serves the same purpose: in this case, for example, to allow p2p between PCIe devices such as a GPU and a NIC.

Also, making the ACS read-only to the guest seems like something that might be useful to other hardware, and also might be nice to have as an explicit qemu configuration option rather than being implemented in this function - are there plans to make this a more general configurable in that future upstream PR?

acs_caps is a generic option for all of PCIe.

FWIW, here is the Testing command for GB200/GB300 GPUDirect RDMA using Nvidia GPU, CX8 and Data Direct Interface:

qemu-system-aarch64 \
          -object iommufd,id=iommufd0 \
          -machine hmat=on -machine virt,accel=kvm,gic-version=3,ras=on,grace-pcie-mmio-identity=on,highmem-mmio-size=4T \
          -cpu host -smp cpus=16 -m size=16G,slots=2,maxmem=256G -nographic \
          -object memory-backend-ram,size=8G,id=m0 \
          -object memory-backend-ram,size=8G,id=m1 \
          -numa node,memdev=m0,cpus=0-15,nodeid=0 -numa node,memdev=m1,nodeid=1 \
          -numa node,nodeid=2 -numa node,nodeid=3 -numa node,nodeid=4 -numa node,nodeid=5\
          -numa node,nodeid=6 -numa node,nodeid=7 -numa node,nodeid=8 -numa node,nodeid=9\
          -device pxb-pcie,id=pcie.1,bus_nr=1,bus=pcie.0 \
          -device arm-smmuv3,primary-bus=pcie.1,id=smmuv3.1,accel=on,ats=on,ril=off,pasid=on,oas=48,cmdqv=on \
          -device pcie-root-port,id=pcie.port1,bus=pcie.1,chassis=1,io-reserve=0,acs-caps=0x1C \
          -device x3130-upstream,id=upstream1,bus=pcie.port1 \
          -device xio3130-downstream,id=downstream1_1,bus=upstream1,chassis=1,slot=1,acs-caps=0x19 \
          -device vfio-pci,host=0018:03:00.0,bus=downstream1_1,id=dmapf1,iommufd=iommufd0 \
          -device xio3130-downstream,id=downstream1_2,bus=upstream1,chassis=1,slot=2,acs-caps=0x15 \
          -device vfio-pci-nohotplug,host=0018:06:00.0,bus=downstream1_2,rombar=0,id=dev0,iommufd=iommufd0 \
          -object acpi-generic-initiator,id=gi0,pci-dev=dev0,node=2 \
          -object acpi-generic-initiator,id=gi1,pci-dev=dev0,node=3 \
          -object acpi-generic-initiator,id=gi2,pci-dev=dev0,node=4 \
          -object acpi-generic-initiator,id=gi3,pci-dev=dev0,node=5 \
          -object acpi-generic-initiator,id=gi4,pci-dev=dev0,node=6 \
          -object acpi-generic-initiator,id=gi5,pci-dev=dev0,node=7 \
          -object acpi-generic-initiator,id=gi6,pci-dev=dev0,node=8 \
          -object acpi-generic-initiator,id=gi7,pci-dev=dev0,node=9 \
          -bios /usr/share/AAVMF/AAVMF_CODE.fd \
          -device nvme,drive=nvme0,serial=deadbeaf1,bus=pcie.0 \
          -drive file=<IMG.qcow2>,index=0,media=disk,format=qcow2,if=none,id=nvme0 \
          -device e1000,netdev=net0,bus=pcie.0 \
          -device pxb-pcie,id=pcie.9,bus_nr=9,bus=pcie.0 \
          -device arm-smmuv3,primary-bus=pcie.9,id=smmuv3.2,accel=on,ats=on,ril=off,pasid=on,oas=48,cmdqv=on \
          -device pcie-root-port,id=pcie.port9,bus=pcie.9,chassis=4,io-reserve=0 \
          -device x3130-upstream,id=upstream9,bus=pcie.port9 \
          -device xio3130-downstream,id=downstream9_1,bus=upstream9,chassis=4,slot=1 \
          -device vfio-pci,host=0012:03:00.1,bus=downstream9_1,id=nic1,iommufd=iommufd0 \
          -netdev user,id=net0,hostfwd=tcp::5558-:22,hostfwd=tcp::5586-:5586 \

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the example command @tdavenvidia .

We are adding 'acs_caps' as a generic option, right? The acs_caps is generic option for all PCIe.

Yeah, that much makes sense to me (that if you set acs-caps=0 for a port in qemu options, it will be read-only), but what I'm really asking is more like:

Is the new behavior (of forcing the ACS to be read-only for all downstream ports, even if the user set acs-caps to something > 0 for that downstream port in their qemu options) something that'd be generically applicable and worthy of some global qemu configuration option that would make the 'overriding' behavior clearer to the end-user (like "block-guest-writes" or something)?

An admin who is launching QEMU and using ACS should know what they are doing. Like the kernel, which allows the use of the 'config_acs' kernel parameter, acs_caps serves the same purpose: in this case, for example, to allow p2p between PCIe devices such as a GPU and a NIC.

That's reasonable. My only concern was that it seemed to me that this could be viewed as unexpected behavior, if the user sets acs-caps=nonzero on a downstream port in their qemu opts, and then it is silently overridden to be 0 here - but if you think that users would only ever really be touching that option if they already have themselves reviewed how it works in their qemu source, I think your code comment on line 1218 is probably sufficient.

(Let me know if you think I am misinterpreting anything here - the documentation on these options is a bit sparse, and I have not experimented with them much as a qemu user before.)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The goal here is to allow the user to configure the ACS settings of a PCIe device before the VM kernel loads, and to preserve those ACS bits by making them read-only, so that the intended p2p works out of the box. Otherwise there is no point in having the 'acs-caps' QEMU parameter, because the Linux kernel changes the ACS settings of the PCI device anyway for IOMMU isolation.

Hint: check linux kernel pci_std_enable_acs()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense, thank you for the clarification.

pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, 0);
} else {
pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, cap_bits);
}

pci_set_word(dev->config + offset + PCI_ACS_CTRL, ctrl_bits);

return 0;
}

void pcie_acs_reset(PCIDevice *dev)
void pcie_acs_reset(PCIDevice *dev, uint16_t val)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: can we name this something more descriptive than val, if it doesn't need to be that generically named? perhaps caps or acs-caps to keep it in line with other uses?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"val" refers to a PCIe ACS register value, so IMO it is fine.

{
if (dev->exp.acs_cap) {
pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0);
pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, val);
}
}

Expand Down
1 change: 1 addition & 0 deletions include/hw/arm/virt.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ struct VirtMachineState {
CXLState cxl_devices_state;
bool legacy_smmuv3_present;
bool pci_preserve_config;
bool grace_pcie_mmio_identity;
};

#define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)
Expand Down
4 changes: 2 additions & 2 deletions include/hw/pci/pcie.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ void pcie_add_capability(PCIDevice *dev,
uint16_t offset, uint16_t size);
void pcie_sync_bridge_lnk(PCIDevice *dev);

void pcie_acs_init(PCIDevice *dev, uint16_t offset);
void pcie_acs_reset(PCIDevice *dev);
int pcie_acs_init(PCIDevice *dev, uint16_t offset, uint16_t ctrl_bits, Error **errp);
void pcie_acs_reset(PCIDevice *dev, uint16_t val);

void pcie_ari_init(PCIDevice *dev, uint16_t offset);
void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t ser_num);
Expand Down
1 change: 1 addition & 0 deletions include/hw/pci/pcie_port.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ struct PCIEPort {

/* pci express switch port */
uint8_t port;
uint16_t acs_caps;
};

void pcie_port_init_reg(PCIDevice *d);
Expand Down