From 8b38f5150206a8c6b6e8b4859dfb4d7f131ad3e3 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Tue, 4 Jun 2019 14:51:21 +0800 Subject: [PATCH 01/26] ZEN: PCI: Add Intel remapped NVMe device support Contains: - PCI: Add Intel remapped NVMe device support Consumer products that are configured by default to run the Intel SATA AHCI controller in "RAID" or "Intel RST Premium With Intel Optane System Acceleration" mode are becoming increasingly prevalent. Unde this mode, NVMe devices are remapped into the SATA device and become hidden from the PCI bus, which means that Linux users cannot access their storage devices unless they go into the firmware setup menu to revert back to AHCI mode - assuming such option is available. Lack of support for this mode is also causing complications for vendors who distribute Linux. Add support for the remapped NVMe mode by creating a virtual PCI bus, where the AHCI and NVMe devices are presented separately, allowing the ahci and nvme drivers to bind in the normal way. Unfortunately the NVMe device configuration space is inaccesible under this scheme, so we provide a fake one, and hope that no DeviceID-based quirks are needed. The interrupt is shared between the AHCI and NVMe devices. Allow pci_real_dma_dev() to traverse back to the real DMA device from the PCI devices created on our virtual bus, in case the iommu driver will be involved with data transfers here. The existing ahci driver is modified to not claim devices where remapped NVMe devices are present, allowing this new driver to step in. The details of the remapping scheme came from patches previously posted by Dan Williams and the resulting discussion. https://phabricator.endlessm.com/T24358 https://phabricator.endlessm.com/T29119 Signed-off-by: Daniel Drake - PCI: Fix order of remapped NVMe devices Signed-off-by: Kai Krakow --- arch/x86/include/asm/pci.h | 6 + arch/x86/pci/common.c | 7 +- drivers/ata/ahci.c | 23 +- drivers/pci/controller/Makefile | 6 + drivers/pci/controller/intel-nvme-remap.c | 462 ++++++++++++++++++++++ 5 files changed, 488 insertions(+), 16 deletions(-) create mode 100644 drivers/pci/controller/intel-nvme-remap.c diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b3ab80a03365cf..5e883b397ff3f2 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -26,6 +26,7 @@ struct pci_sysdata { #if IS_ENABLED(CONFIG_VMD) struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ #endif + struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */ }; extern int pci_routeirq; @@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus *bus) #define is_vmd(bus) false #endif /* CONFIG_VMD */ +static inline bool is_nvme_remap(struct pci_bus *bus) +{ + return to_pci_sysdata(bus)->nvme_remap_dev != NULL; +} + /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index ddb798603201ef..7c20387d82029a 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) return 0; } -#if IS_ENABLED(CONFIG_VMD) struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) { +#if IS_ENABLED(CONFIG_VMD) if (is_vmd(dev->bus)) return to_pci_sysdata(dev->bus)->vmd_dev; +#endif + + if (is_nvme_remap(dev->bus)) + return to_pci_sysdata(dev->bus)->nvme_remap_dev; return dev; } -#endif diff --git a/drivers/ata/ahci.c 
b/drivers/ata/ahci.c index 7a7f88b3fa2b18..cb26ab099da2bd 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1672,7 +1672,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) } #endif -static void ahci_remap_check(struct pci_dev *pdev, int bar, +static int ahci_remap_check(struct pci_dev *pdev, int bar, struct ahci_host_priv *hpriv) { int i; @@ -1685,7 +1685,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, pci_resource_len(pdev, bar) < SZ_512K || bar != AHCI_PCI_BAR_STANDARD || !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) - return; + return 0; cap = readq(hpriv->mmio + AHCI_REMAP_CAP); for (i = 0; i < AHCI_MAX_REMAP; i++) { @@ -1700,18 +1700,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, } if (!hpriv->remapped_nvme) - return; - - dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n", - hpriv->remapped_nvme); - dev_warn(&pdev->dev, - "Switch your BIOS from RAID to AHCI mode to use them.\n"); + return 0; - /* - * Don't rely on the msi-x capability in the remap case, - * share the legacy interrupt across ahci and remapped devices. - */ - hpriv->flags |= AHCI_HFLAG_NO_MSI; + /* Abort probe, allowing intel-nvme-remap to step in when available */ + dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n"); + return -ENODEV; } static int ahci_get_irq_vector(struct ata_host *host, int port) @@ -1975,7 +1968,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return -ENOMEM; /* detect remapped nvme devices */ - ahci_remap_check(pdev, ahci_pci_bar, hpriv); + rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv); + if (rc) + return rc; sysfs_add_file_to_group(&pdev->dev.kobj, &dev_attr_remapped_nvme.attr, diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index 038ccbd9e3ba23..de5e4f5145af8d 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 +ifdef CONFIG_X86_64 +ifdef CONFIG_SATA_AHCI +obj-y += intel-nvme-remap.o +endif +endif + obj-$(CONFIG_PCIE_CADENCE) += cadence/ obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c new file mode 100644 index 00000000000000..e105e6f5cc91d1 --- /dev/null +++ b/drivers/pci/controller/intel-nvme-remap.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel remapped NVMe device support. + * + * Copyright (c) 2019 Endless Mobile, Inc. + * Author: Daniel Drake + * + * Some products ship by default with the SATA controller in "RAID" or + * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this + * mode, which we refer to as "remapped NVMe" mode, any installed NVMe + * devices disappear from the PCI bus, and instead their I/O memory becomes + * available within the AHCI device BARs. + * + * This scheme is understood to be a way of avoiding usage of the standard + * Windows NVMe driver under that OS, instead mandating usage of Intel's + * driver instead, which has better power management, and presumably offers + * some RAID/disk-caching solutions too. + * + * Here in this driver, we support the remapped NVMe mode by claiming the + * AHCI device and creating a fake PCIe root port. On the new bus, the + * original AHCI device is exposed with only minor tweaks. Then, fake PCI + * devices corresponding to the remapped NVMe devices are created. 
The usual + * ahci and nvme drivers are then expected to bind to these devices and + * operate as normal. + * + * The PCI configuration space for the NVMe devices is completely + * unavailable, so we fake a minimal one and hope for the best. + * + * Interrupts are shared between the AHCI and NVMe devices. For simplicity, + * we only support the legacy interrupt here, although MSI support + * could potentially be added later. + */ + +#define MODULE_NAME "intel-nvme-remap" + +#include +#include +#include +#include +#include + +#define AHCI_PCI_BAR_STANDARD 5 + +struct nvme_remap_dev { + struct pci_dev *dev; /* AHCI device */ + struct pci_bus *bus; /* our fake PCI bus */ + struct pci_sysdata sysdata; + int irq_base; /* our fake interrupts */ + + /* + * When we detect an all-ones write to a BAR register, this flag + * is set, so that we return the BAR size on the next read (a + * standard PCI behaviour). + * This includes the assumption that an all-ones BAR write is + * immediately followed by a read of the same register. + */ + bool bar_sizing; + + /* + * Resources copied from the AHCI device, to be regarded as + * resources on our fake bus. + */ + struct resource ahci_resources[PCI_NUM_RESOURCES]; + + /* Resources corresponding to the NVMe devices. */ + struct resource remapped_dev_mem[AHCI_MAX_REMAP]; + + /* Number of remapped NVMe devices found. */ + int num_remapped_devices; +}; + +static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus) +{ + return container_of(bus->sysdata, struct nvme_remap_dev, sysdata); +} + + +/******** PCI configuration space **********/ + +/* + * Helper macros for tweaking returned contents of PCI configuration space. + * + * value contains len bytes of data read from reg. + * If fixup_reg is included in that range, fix up the contents of that + * register to fixed_value. + */ +#define NR_FIX8(fixup_reg, fixed_value) do { \ + if (reg <= fixup_reg && fixup_reg < reg + len) \ + ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \ + } while (0) + +#define NR_FIX16(fixup_reg, fixed_value) do { \ + NR_FIX8(fixup_reg, fixed_value); \ + NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ + } while (0) + +#define NR_FIX24(fixup_reg, fixed_value) do { \ + NR_FIX8(fixup_reg, fixed_value); \ + NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ + NR_FIX8(fixup_reg + 2, fixed_value >> 16); \ + } while (0) + +#define NR_FIX32(fixup_reg, fixed_value) do { \ + NR_FIX16(fixup_reg, (u16) fixed_value); \ + NR_FIX16(fixup_reg + 2, fixed_value >> 16); \ + } while (0) + +/* + * Read PCI config space of the slot 0 (AHCI) device. + * We pass through the read request to the underlying device, but + * tweak the results in some cases. + */ +static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg, + int len, u32 *value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct pci_bus *ahci_dev_bus = nrdev->dev->bus; + int ret; + + ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn, + reg, len, value); + if (ret) + return ret; + + /* + * Adjust the device class, to prevent this driver from attempting to + * additionally probe the device we're simulating here. + */ + NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI); + + /* + * Unset interrupt pin, otherwise ACPI tries to find routing + * info for our virtual IRQ, fails, and complains. + */ + NR_FIX8(PCI_INTERRUPT_PIN, 0); + + /* + * Truncate the AHCI BAR to not include the region that covers the + * hidden devices. 
This will cause the ahci driver to successfully + * probe th new device (instead of handing it over to this driver). + */ + if (nrdev->bar_sizing) { + NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1)); + nrdev->bar_sizing = false; + } + + return PCIBIOS_SUCCESSFUL; +} + +/* + * Read PCI config space of a remapped device. + * Since the original PCI config space is inaccessible, we provide a minimal, + * fake config space instead. + */ +static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port, + int reg, int len, u32 *value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct resource *remapped_mem; + + if (port > nrdev->num_remapped_devices) + return PCIBIOS_DEVICE_NOT_FOUND; + + *value = 0; + remapped_mem = &nrdev->remapped_dev_mem[port - 1]; + + /* Set a Vendor ID, otherwise Linux assumes no device is present */ + NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL); + + /* Always appear on & bus mastering */ + NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + + /* Set class so that nvme driver probes us */ + NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS); + + if (nrdev->bar_sizing) { + NR_FIX32(PCI_BASE_ADDRESS_0, + ~(resource_size(remapped_mem) - 1)); + nrdev->bar_sizing = false; + } else { + resource_size_t mem_start = remapped_mem->start; + + mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64; + NR_FIX32(PCI_BASE_ADDRESS_0, mem_start); + mem_start >>= 32; + NR_FIX32(PCI_BASE_ADDRESS_1, mem_start); + } + + return PCIBIOS_SUCCESSFUL; +} + +/* Read PCI configuration space. */ +static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 *value) +{ + if (PCI_SLOT(devfn) == 0) + return nvme_remap_pci_read_slot0(bus, reg, len, value); + else + return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn), + reg, len, value); +} + +/* + * Write PCI config space of the slot 0 (AHCI) device. + * Apart from the special case of BAR sizing, we disable all writes. + * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master) + * that would affect the operation of the NVMe devices. + */ +static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg, + int len, u32 value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct pci_bus *ahci_dev_bus = nrdev->dev->bus; + + if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) { + /* + * Writing all-ones to a BAR means that the size of the + * memory region is being checked. Flag this so that we can + * reply with an appropriate size on the next read. + */ + if (value == ~0) + nrdev->bar_sizing = true; + + return ahci_dev_bus->ops->write(ahci_dev_bus, + nrdev->dev->devfn, + reg, len, value); + } + + return PCIBIOS_SET_FAILED; +} + +/* + * Write PCI config space of a remapped device. + * Since the original PCI config space is inaccessible, we reject all + * writes, except for the special case of BAR probing. + */ +static int nvme_remap_pci_write_remapped(struct pci_bus *bus, + unsigned int port, + int reg, int len, u32 value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + + if (port > nrdev->num_remapped_devices) + return PCIBIOS_DEVICE_NOT_FOUND; + + /* + * Writing all-ones to a BAR means that the size of the memory + * region is being checked. Flag this so that we can reply with + * an appropriate size on the next read. + */ + if (value == ~0 && reg >= PCI_BASE_ADDRESS_0 + && reg <= PCI_BASE_ADDRESS_5) { + nrdev->bar_sizing = true; + return PCIBIOS_SUCCESSFUL; + } + + return PCIBIOS_SET_FAILED; +} + +/* Write PCI configuration space. 
*/ +static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 value) +{ + if (PCI_SLOT(devfn) == 0) + return nvme_remap_pci_write_slot0(bus, reg, len, value); + else + return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn), + reg, len, value); +} + +static struct pci_ops nvme_remap_pci_ops = { + .read = nvme_remap_pci_read, + .write = nvme_remap_pci_write, +}; + + +/******** Initialization & exit **********/ + +/* + * Find a PCI domain ID to use for our fake bus. + * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits). + */ +static int find_free_domain(void) +{ + int domain = 0xffff; + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus)) != NULL) + domain = max_t(int, domain, pci_domain_nr(bus)); + + return domain + 1; +} + +static int find_remapped_devices(struct nvme_remap_dev *nrdev, + struct list_head *resources) +{ + void __iomem *mmio; + int i, count = 0; + u32 cap; + + mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD, + pci_resource_len(nrdev->dev, + AHCI_PCI_BAR_STANDARD)); + if (!mmio) + return -ENODEV; + + /* Check if this device might have remapped nvme devices. */ + if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K || + !(readl(mmio + AHCI_VSCAP) & 1)) + return -ENODEV; + + cap = readq(mmio + AHCI_REMAP_CAP); + for (i = AHCI_MAX_REMAP-1; i >= 0; i--) { + struct resource *remapped_mem; + + if ((cap & (1 << i)) == 0) + continue; + if (readl(mmio + ahci_remap_dcc(i)) + != PCI_CLASS_STORAGE_EXPRESS) + continue; + + /* We've found a remapped device */ + remapped_mem = &nrdev->remapped_dev_mem[count++]; + remapped_mem->start = + pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD) + + ahci_remap_base(i); + remapped_mem->end = remapped_mem->start + + AHCI_REMAP_N_SIZE - 1; + remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED; + pci_add_resource(resources, remapped_mem); + } + + pcim_iounmap(nrdev->dev, mmio); + + if (count == 0) + return -ENODEV; + + nrdev->num_remapped_devices = count; + dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n", + nrdev->num_remapped_devices); + return 0; +} + +static void nvme_remap_remove_root_bus(void *data) +{ + struct pci_bus *bus = data; + + pci_stop_root_bus(bus); + pci_remove_root_bus(bus); +} + +static int nvme_remap_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + struct nvme_remap_dev *nrdev; + LIST_HEAD(resources); + int i; + int ret; + struct pci_dev *child; + + nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL); + nrdev->sysdata.domain = find_free_domain(); + nrdev->sysdata.nvme_remap_dev = dev; + nrdev->dev = dev; + pci_set_drvdata(dev, nrdev); + + ret = pcim_enable_device(dev); + if (ret < 0) + return ret; + + pci_set_master(dev); + + ret = find_remapped_devices(nrdev, &resources); + if (ret) + return ret; + + /* Add resources from the original AHCI device */ + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &dev->resource[i]; + + if (res->start) { + struct resource *nr_res = &nrdev->ahci_resources[i]; + + nr_res->start = res->start; + nr_res->end = res->end; + nr_res->flags = res->flags; + pci_add_resource(&resources, nr_res); + } + } + + /* Create virtual interrupts */ + nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0, + nrdev->num_remapped_devices + 1, + 0); + if (nrdev->irq_base < 0) + return nrdev->irq_base; + + /* Create and populate PCI bus */ + nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops, + &nrdev->sysdata, &resources); + if (!nrdev->bus) + return 
-ENODEV; + + if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus, + nrdev->bus)) + return -ENOMEM; + + /* We don't support sharing MSI interrupts between these devices */ + nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI; + + pci_scan_child_bus(nrdev->bus); + + list_for_each_entry(child, &nrdev->bus->devices, bus_list) { + /* + * Prevent PCI core from trying to move memory BARs around. + * The hidden NVMe devices are at fixed locations. + */ + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &child->resource[i]; + + if (res->flags & IORESOURCE_MEM) + res->flags |= IORESOURCE_PCI_FIXED; + } + + /* Share the legacy IRQ between all devices */ + child->irq = dev->irq; + } + + pci_assign_unassigned_bus_resources(nrdev->bus); + pci_bus_add_devices(nrdev->bus); + + return 0; +} + +static const struct pci_device_id nvme_remap_ids[] = { + /* + * Match all Intel RAID controllers. + * + * There's overlap here with the set of devices detected by the ahci + * driver, but ahci will only successfully probe when there + * *aren't* any remapped NVMe devices, and this driver will only + * successfully probe when there *are* remapped NVMe devices that + * need handling. + */ + { + PCI_VDEVICE(INTEL, PCI_ANY_ID), + .class = PCI_CLASS_STORAGE_RAID << 8, + .class_mask = 0xffffff00, + }, + {0,} +}; +MODULE_DEVICE_TABLE(pci, nvme_remap_ids); + +static struct pci_driver nvme_remap_drv = { + .name = MODULE_NAME, + .id_table = nvme_remap_ids, + .probe = nvme_remap_probe, +}; +module_pci_driver(nvme_remap_drv); + +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL v2"); From ebc830040eebe01a39341c0952975f7b0ff88514 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sun, 8 Mar 2020 00:31:35 -0800 Subject: [PATCH 02/26] ZEN: Disable stack conservation for GCC There's plenty of room on the stack for a few more inlined bytes here and there. The measured stack usage at runtime is still safe without this, and performance is surely improved at a microscopic level, so remove it. Signed-off-by: Sultan Alsawaf --- Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Makefile b/Makefile index a082a1d7c7d9b4..22318c98072274 100644 --- a/Makefile +++ b/Makefile @@ -1067,11 +1067,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check -# conserve stack if available -ifdef CONFIG_CC_IS_GCC -KBUILD_CFLAGS += -fconserve-stack -endif - # Ensure compilers do not transform certain loops into calls to wcslen() KBUILD_CFLAGS += -fno-builtin-wcslen From 32199ce876671e6bc88625807e73590e2115f3b6 Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Sat, 4 Mar 2023 17:47:36 -0600 Subject: [PATCH 03/26] ZEN: arch/x86: Disable AVX2 and tree vectorization From ClearLinux's own patches, disable both AVX2 and tree vectorization when using O3 and higher than generic amd64 architectures. 
Source: https://github.com/clearlinux-pkgs/linux/blob/main/0133-novector.patch Signed-off-by: Kai Krakow --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1a27efcf3c205a..2f06bd6847edd1 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -75,7 +75,7 @@ export BITS # # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a -mno-avx2 -fno-tree-vectorize KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 From a792cf2e6a60f3478c5b6e7f1ace898d0d339625 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 2 Jun 2016 23:36:32 -0500 Subject: [PATCH 04/26] ZEN: Initialize ata before graphics ATA init is the long pole in the boot process, and its asynchronous. move the graphics init after it so that ata and graphics initialize in parallel Signed-off-by: Kai Krakow --- drivers/Makefile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index 8e1ffa4358d5f1..2a9eec99c1f7c3 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -64,14 +64,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ -# gpu/ comes after char for AGP vs DRM startup and after iommu -obj-y += gpu/ - obj-$(CONFIG_CONNECTOR) += connector/ -# i810fb depends on char/agp/ -obj-$(CONFIG_FB_I810) += video/fbdev/i810/ - obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ @@ -83,6 +77,13 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ + +# gpu/ comes after char for AGP vs DRM startup and after iommu +obj-y += gpu/ + +# i810fb depends on char/agp/ +obj-$(CONFIG_FB_I810) += video/fbdev/i810/ + obj-$(CONFIG_TARGET_CORE) += target/ obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ From b719954c08b2adb73a4e2bcc1a1681d7290858a1 Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Sun, 27 Dec 2020 14:43:13 +0000 Subject: [PATCH 05/26] ZEN: Input: evdev - use call_rcu when detaching client Significant time was spent on synchronize_rcu in evdev_detach_client when applications closed evdev devices. Switching VT away from a graphical environment commonly leads to mass input device closures, which could lead to noticable delays on systems with many input devices. Replace synchronize_rcu with call_rcu, deferring reclaim of the evdev client struct till after the RCU grace period instead of blocking the calling application. 
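As a general illustration of the pattern (a minimal sketch only; "struct foo" and its helpers are made-up names, the real change is in the evdev diff below), the close path stops waiting for a grace period and instead hands the object to an RCU callback:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	struct list_head node;
	struct rcu_head rcu;
};

/* Runs after the grace period has elapsed, off the close path. */
static void foo_reclaim(struct rcu_head *rp)
{
	struct foo *f = container_of(rp, struct foo, rcu);

	kfree(f);
}

static void foo_detach(struct foo *f)
{
	list_del_rcu(&f->node);
	call_rcu(&f->rcu, foo_reclaim);	/* was: synchronize_rcu(); kfree(f); */
}

Readers traversing the list under rcu_read_lock() are unaffected; only reclaim of the detached object is deferred.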
While this does not solve all slow evdev fd closures, it takes care of a good portion of them, including this simple test: #include #include int main(int argc, char *argv[]) { int idx, fd; const char *path = "/dev/input/event0"; for (idx = 0; idx < 1000; idx++) { if ((fd = open(path, O_RDWR)) == -1) { return -1; } close(fd); } return 0; } Time to completion of above test when run locally: Before: 0m27.111s After: 0m0.018s Signed-off-by: Kenny Levinsen --- drivers/input/evdev.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 90ff6be85cf466..15159c1cf6e1a0 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; + struct rcu_head rcu; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; @@ -368,13 +369,22 @@ static void evdev_attach_client(struct evdev *evdev, spin_unlock(&evdev->client_lock); } +static void evdev_reclaim_client(struct rcu_head *rp) +{ + struct evdev_client *client = container_of(rp, struct evdev_client, rcu); + unsigned int i; + for (i = 0; i < EV_CNT; ++i) + bitmap_free(client->evmasks[i]); + kvfree(client); +} + static void evdev_detach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); - synchronize_rcu(); + call_rcu(&client->rcu, evdev_reclaim_client); } static int evdev_open_device(struct evdev *evdev) @@ -427,7 +437,6 @@ static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; - unsigned int i; mutex_lock(&evdev->mutex); @@ -439,11 +448,6 @@ static int evdev_release(struct inode *inode, struct file *file) evdev_detach_client(evdev, client); - for (i = 0; i < EV_CNT; ++i) - bitmap_free(client->evmasks[i]); - - kvfree(client); - evdev_close_device(evdev); return 0; @@ -486,7 +490,6 @@ static int evdev_open(struct inode *inode, struct file *file) err_free_client: evdev_detach_client(evdev, client); - kvfree(client); return error; } From b8bff9c6e28a3c20d80d69f6e680741c22f938fe Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Thu, 27 Apr 2023 14:43:57 -0500 Subject: [PATCH 06/26] ZEN: Set default max map count to (INT_MAX - 5) Per [Fedora][1], they intend to change the default max map count for their distribution to improve OOTB compatibility with games played through Steam/Proton. The value they picked comes from the Steam Deck, which defaults to INT_MAX - MAPCOUNT_ELF_CORE_MARGIN. Since most ZEN and Liquorix users probably play games, follow Valve's lead and raise this value to their default. [1]: https://fedoraproject.org/wiki/Changes/IncreaseVmMaxMapCount Signed-off-by: Kai Krakow --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82c9..819daddd3a493b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,7 +201,7 @@ static inline void __mm_zero_struct_page(struct page *page) * that. 
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; From 00fb3218df4e6e177dd44b988e53cbdb9dad1ea0 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sun, 19 Apr 2020 19:59:18 -0700 Subject: [PATCH 07/26] ZEN: mm: Stop kswapd early when nothing's waiting for it to free pages Contains: - mm: Stop kswapd early when nothing's waiting for it to free pages Keeping kswapd running when all the failed allocations that invoked it are satisfied incurs a high overhead due to unnecessary page eviction and writeback, as well as spurious VM pressure events to various registered shrinkers. When kswapd doesn't need to work to make an allocation succeed anymore, stop it prematurely to save resources. Signed-off-by: Sultan Alsawaf - mm: Don't stop kswapd on a per-node basis when there are no waiters The page allocator wakes all kswapds in an allocation context's allowed nodemask in the slow path, so it doesn't make sense to have the kswapd- waiter count per each NUMA node. Instead, it should be a global counter to stop all kswapds when there are no failed allocation requests. Signed-off-by: Sultan Alsawaf - mm: Increment kswapd_waiters for throttled direct reclaimers Throttled direct reclaimers will wake up kswapd and wait for kswapd to satisfy their page allocation request, even when the failed allocation lacks the __GFP_KSWAPD_RECLAIM flag in its gfp mask. As a result, kswapd may think that there are no waiters and thus exit prematurely, causing throttled direct reclaimers lacking __GFP_KSWAPD_RECLAIM to stall on waiting for kswapd to wake them up. Incrementing the kswapd_waiters counter when such direct reclaimers become throttled fixes the problem. Signed-off-by: Sultan Alsawaf Signed-off-by: Kai Krakow --- mm/internal.h | 1 + mm/page_alloc.c | 17 ++++++++++++++--- mm/vmscan.c | 19 +++++++++++++------ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b832..4f1282289372bb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -823,6 +823,7 @@ void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; +extern atomic_long_t kswapd_waiters; struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, nodemask_t *); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ed82ee55e66aff..677f9bb0428b71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -91,6 +91,8 @@ typedef int __bitwise fpi_t; /* Free the page without taking locks. Rely on trylock only. 
*/ #define FPI_TRYLOCK ((__force fpi_t)BIT(2)) +atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0); + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -4640,6 +4642,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool woke_kswapd = false; if (unlikely(nofail)) { /* @@ -4699,8 +4702,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; } - if (alloc_flags & ALLOC_KSWAPD) + if (alloc_flags & ALLOC_KSWAPD) { + if (!woke_kswapd) { + atomic_long_inc(&kswapd_waiters); + woke_kswapd = true; + } wake_all_kswapds(order, gfp_mask, ac); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4915,9 +4923,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto retry; } fail: - warn_alloc(gfp_mask, ac->nodemask, - "page allocation failure: order:%u", order); got_pg: + if (woke_kswapd) + atomic_long_dec(&kswapd_waiters); + if (!page) + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); return page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3dff..3f0cfcf9cdf410 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6484,7 +6484,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 0; } -static bool allow_direct_reclaim(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -6509,6 +6509,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) wmark_ok = free_pages > pfmemalloc_reserve / 2; + /* The throttled direct reclaimer is now a kswapd waiter */ + if (unlikely(!using_kswapd && !wmark_ok)) + atomic_long_inc(&kswapd_waiters); + /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) @@ -6574,7 +6578,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (allow_direct_reclaim(pgdat)) + if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM)) goto out; break; } @@ -6596,11 +6600,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat), HZ); + allow_direct_reclaim(pgdat, true), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); + allow_direct_reclaim(pgdat, true)); + + if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM))) + atomic_long_dec(&kswapd_waiters); if (fatal_signal_pending(current)) return true; @@ -7130,14 +7137,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * able to safely make forward progress. 
Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - allow_direct_reclaim(pgdat)) + allow_direct_reclaim(pgdat, true)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (was_frozen || ret) + if (was_frozen || ret || !atomic_long_read(&kswapd_waiters)) break; /* From 29b9083a90d0037e761b2e5de400765f4278d0da Mon Sep 17 00:00:00 2001 From: EXtremeExploit Date: Fri, 29 Nov 2024 13:05:27 -0300 Subject: [PATCH 08/26] ZEN: ahci: Disable staggered spinup by default This patch disabled the staggered spinup used for HDDs. The goal is to make boot times faster on systems with the small downside of a small spike in power consumption. Systems with a bunch of HDDs would see considerable faster boots This does make sense in the zen kernel as its supposed to be a kernel specialized for desktop performance, and faster boot times does fit into that description Signed-off-by: Pedro Montes Alcalde --- drivers/ata/libahci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index c79abdfcd7a9b0..59acfa77934b9c 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -34,7 +34,7 @@ #include "libata.h" static int ahci_skip_host_reset; -int ahci_ignore_sss; +int ahci_ignore_sss = 1; EXPORT_SYMBOL_GPL(ahci_ignore_sss); module_param_named(skip_host_reset, ahci_skip_host_reset, int, 0444); From 8c67bdf5ffe9a422467dc5be8eb799ec4f9a53a2 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:10:06 +0100 Subject: [PATCH 09/26] ZEN: INTERACTIVE: Base config item Signed-off-by: Kai Krakow --- init/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49e7..07f6cd50190ff1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -187,6 +187,12 @@ config THREAD_INFO_IN_TASK menu "General setup" +config ZEN_INTERACTIVE + bool "Tune kernel for interactivity" + default y + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. + config BROKEN bool help From 7a2aa2b33eb4df89a1e433bf8dc0295084f03980 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:11:05 +0100 Subject: [PATCH 10/26] ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices Signed-off-by: Kai Krakow --- block/elevator.c | 4 ++++ init/Kconfig | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index e2ebfbf107b3af..178c89b3bc9c43 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -741,7 +741,11 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, void elevator_set_default(struct request_queue *q) { struct elv_change_ctx ctx = { +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) + .name = "bfq", +#else .name = "mq-deadline", +#endif .no_uevent = true, }; int err; diff --git a/init/Kconfig b/init/Kconfig index 07f6cd50190ff1..75dde801645500 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -193,6 +193,10 @@ config ZEN_INTERACTIVE help Tunes the kernel for responsiveness at the cost of throughput and power usage. 
+ --- Block Layer ---------------------------------------- + + Default scheduler for SQ..: mq-deadline -> bfq + config BROKEN bool help From 816b23e9c87dc64f92700a026ce60477f73d4016 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Sat, 2 Aug 2025 04:36:06 +0200 Subject: [PATCH 11/26] block: Clean up elevator_set_default In case of a multi-queue device, the code pointlessly loaded the default elevator just to drop it again. Signed-off-by: Kai Krakow --- block/elevator.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 178c89b3bc9c43..4c29d266ce9787 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -762,17 +762,18 @@ void elevator_set_default(struct request_queue *q) * have multiple queues or mq-deadline is not available, default * to "none". */ + if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) + return; + e = elevator_find_get(ctx.name); if (!e) return; - if ((q->nr_hw_queues == 1 || - blk_mq_is_shared_tags(q->tag_set->flags))) { - err = elevator_change(q, &ctx); - if (err < 0) - pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", - ctx.name, err); - } + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", + ctx.name, err); + elevator_put(e); } From 2d79b820e3e3e4572b470292bd36ab0cbbf4bcc3 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 12 Dec 2022 00:03:03 +0100 Subject: [PATCH 12/26] ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices Fall back straight to none instead of mq-deadline. Some benchmarks in a [recent paper][1] suggest that mq-deadline has too much lock contention, hurting throughput and eating CPU waiting for spinlocks. [1]: https://research.spec.org/icpe_proceedings/2024/proceedings/p154.pdf Signed-off-by: Kai Krakow --- block/elevator.c | 4 ++++ init/Kconfig | 1 + 2 files changed, 5 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index 4c29d266ce9787..f8b3fc16f304dd 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -763,7 +763,11 @@ void elevator_set_default(struct request_queue *q) * to "none". */ if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER) + ctx.name = "kyber"; +#else return; +#endif e = elevator_find_get(ctx.name); if (!e) diff --git a/init/Kconfig b/init/Kconfig index 75dde801645500..3206ea9118c987 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -196,6 +196,7 @@ config ZEN_INTERACTIVE --- Block Layer ---------------------------------------- Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber config BROKEN bool From 401c8c7f974cdaed05afd16a07c108bb587c4d8d Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:21:09 +0100 Subject: [PATCH 13/26] ZEN: INTERACTIVE: Enable background reclaim of hugepages Use [defer+madvise] as default khugepaged defrag strategy: For some reason, the default strategy to respond to THP fault fallbacks is still just madvise, meaning stall if the program wants transparent hugepages, but don't trigger a background reclaim / compaction if THP begins to fail allocations. This creates a snowball affect where we still use the THP code paths, but we almost always fail once a system has been active and busy for a while. 
The option "defer" was created for interactive systems where THP can still improve performance. If we have to fallback to a regular page due to an allocation failure or anything else, we will trigger a background reclaim and compaction so future THP attempts succeed and previous attempts eventually have their smaller pages combined without stalling running applications. We still want madvise to stall applications that explicitely want THP, so defer+madvise _does_ make a ton of sense. Make it the default for interactive systems, especially if the kernel maintainer left transparent hugepages on "always". Reasoning and details in the original patch: https://lwn.net/Articles/711248/ Signed-off-by: Kai Krakow --- init/Kconfig | 4 ++++ mm/huge_memory.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 3206ea9118c987..45eda8b28af869 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -198,6 +198,10 @@ config ZEN_INTERACTIVE Default scheduler for SQ..: mq-deadline -> bfq Default scheduler for MQ..: none -> kyber + --- Virtual Memory Subsystem --------------------------- + + Background-reclaim hugepages...: no -> yes + config BROKEN bool help diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6cba1cb14b23ac..43449e5b51aee8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -63,7 +63,11 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1< Date: Tue, 31 Oct 2023 19:03:10 +0100 Subject: [PATCH 14/26] ZEN: INTERACTIVE: Tune EEVDF for interactivity 5.7: Take "sysctl_sched_nr_migrate" tune from early XanMod builds of 128. As of 5.7, XanMod uses 256 but that may affect applications that require timely response to IRQs. 5.15: Per [a comment][1] on our ZEN INTERACTIVE commit, reducing the cost of migration causes the system less responsive under high load. Most likely the combination of reduced migration cost + the higher number of tasks that can be migrated at once contributes to this. To better handle this situation, restore the mainline migration cost value and also reduce the max number of tasks that can be migrated in batch from 128 to 64. If this doesn't help, we'll restore the reduced migration cost and keep total number of tasks that can be migrated at once to 32. [1]: https://github.com/zen-kernel/zen-kernel/commit/be5ba234ca0a5aabe74bfc7e1f636f085bd3823c#commitcomment-63159674 6.6: Port the tuning to EEVDF, which removed a couple of settings. 6.7: Instead of increasing the number of tasks that migrate at once, migrate the amount acceptable for PREEMPT_RT, but reduce the cost so migrations occur more often. This should make CFS/EEVDF behave more like out-of-tree schedulers that aggressively use idle cores to reduce latency, but without the jank caused by rebalancing too many tasks at once. 
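For a sense of scale (not part of the change itself): with the default log scaling noted in the fair.c comment in the diff below ("0.70 msec * (1 + ilog(ncpus))"), an 8-CPU machine multiplies the base slice by 1 + ilog(8) = 4, so the effective slice drops from about 2.8 ms to about 1.6 ms. On kernels that expose the scheduler debugfs directory, the resulting values can typically be checked at runtime under /sys/kernel/debug/sched/ (base_slice_ns, migration_cost_ns, nr_migrate).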
Signed-off-by: Kai Krakow --- init/Kconfig | 7 +++++++ kernel/sched/fair.c | 13 +++++++++++++ kernel/sched/sched.h | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 45eda8b28af869..a238ac05ea7b70 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -202,6 +202,13 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes + --- EEVDF CPU Scheduler -------------------------------- + + Minimal granularity............: 0.7 -> 0.4 ms + Migration cost.................: 0.5 -> 0.3 ms + Bandwidth slice size...........: 5 -> 3 ms + Task rebalancing threshold.....: 32 -> 8 + config BROKEN bool help diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b752324270b08..a31912c241cf10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -76,10 +76,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +unsigned int sysctl_sched_base_slice = 400000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; +#else unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; +#endif +#ifdef CONFIG_ZEN_INTERACTIVE +__read_mostly unsigned int sysctl_sched_migration_cost = 300000UL; +#else __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; +#endif static int __init setup_sched_thermal_decay_shift(char *str) { @@ -122,8 +131,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d729..a5addab8f3f223 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2800,7 +2800,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); -#ifdef CONFIG_PREEMPT_RT +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ZEN_INTERACTIVE) # define SCHED_NR_MIGRATE_BREAK 8 #else # define SCHED_NR_MIGRATE_BREAK 32 From 36bf8a0dac5458dd1b74ff2e801bc41da900d753 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:27:16 +0100 Subject: [PATCH 15/26] ZEN: INTERACTIVE: Tune ondemand governor for interactivity 4.10: During some personal testing with the Dolphin emulator, MuQSS has serious problems scaling its frequencies causing poor performance where boosting the CPU frequencies would have fixed them. Reducing the up_threshold to 45 with MuQSS appears to fix the issue, letting the introduction to "Star Wars: Rogue Leader" run at 100% speed versus about 80% on my test system. Also, lets refactor the definitions and include some indentation to help the reader discern what the scope of all the macros are. 5.4: On the last custom kernel benchmark from Phoronix with Xanmod, Michael configured all the kernels to run using ondemand instead of the kernel's [default selection][1]. This reminded me that another option outside of the kernels control is the user's choice to change the cpufreq governor, for better or for worse. In Liquorix, performance is the default governor whether you're running acpi-cpufreq or intel-pstate. 
I expect laptop users to install TLP or LMT to control the power balance on their system, especially when they're plugged in or on battery. However, it's pretty clear to me a lot of people would choose ondemand over performance since it's not obvious it has huge performance ramifications with MuQSS, and ondemand otherwise is "good enough" for most people. Lets codify lower up thresholds for MuQSS to more closely synergize with its aggressive thread migration behavior. This way when ondemand is configured, you get sort of a "performance-lite" type of result but with the power savings you expect when leaving the running system idle. [1]: https://www.phoronix.com/scan.php?page=article&item=xanmod-2020-kernel 5.14: Although CFS and similar schedulers (BMQ, PDS, and CacULE), reuse a lot more of mainline scheduling and do a good job of pinning single threaded tasks to their respective core, there's still applications that confusingly run steady near 50% and benefit from going full speed or turbo when they need to run (emulators for more recent consoles come to mind). Drop the up threshold for all non-MuQSS schedulers from 80/95 to 55/60. 5.15: Remove MuQSS cpufreq configuration. Signed-off-by: Kai Krakow --- drivers/cpufreq/cpufreq_ondemand.c | 8 +++++++- init/Kconfig | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index a6ecc203f7b7f3..46ea23cbb75437 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -18,10 +18,16 @@ #include "cpufreq_ondemand.h" /* On-demand governor macros */ +#if defined(CONFIG_ZEN_INTERACTIVE) +#define DEF_FREQUENCY_UP_THRESHOLD (55) +#define MICRO_FREQUENCY_UP_THRESHOLD (60) +#define DEF_SAMPLING_DOWN_FACTOR (5) +#else #define DEF_FREQUENCY_UP_THRESHOLD (80) +#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define DEF_SAMPLING_DOWN_FACTOR (1) +#endif #define MAX_SAMPLING_DOWN_FACTOR (100000) -#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/init/Kconfig b/init/Kconfig index a238ac05ea7b70..facb36dc2656c2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -209,6 +209,12 @@ config ZEN_INTERACTIVE Bandwidth slice size...........: 5 -> 3 ms Task rebalancing threshold.....: 32 -> 8 + --- CPUFreq Settings ----------------------------------- + + Ondemand sampling down factor..: 1 -> 5 + Ondemand default up threshold..: 80 -> 55 + Ondemand micro up threshold....: 95 -> 60 + config BROKEN bool help From cc2af159d1abdf6781042ebe37d8a29a3dc571f2 Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Sat, 5 Mar 2022 11:37:14 -0600 Subject: [PATCH 16/26] ZEN: INTERACTIVE: mm: Disable unevictable compaction This option is already disabled when CONFIG_PREEMPT_RT is enabled, lets turn it off when CONFIG_ZEN_INTERACTIVE is set as well. 
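Note that COMPACT_UNEVICTABLE_DEFAULT only provides the boot-time default of the vm.compact_unevictable_allowed sysctl, so workloads that rely on compaction of mlocked pages can still switch it back on at runtime, e.g. "sysctl vm.compact_unevictable_allowed=1".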
Signed-off-by: Kai Krakow --- init/Kconfig | 1 + mm/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index facb36dc2656c2..ce7a58f97abf14 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -201,6 +201,7 @@ config ZEN_INTERACTIVE --- Virtual Memory Subsystem --------------------------- Background-reclaim hugepages...: no -> yes + Compact unevictable............: yes -> no --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/Kconfig b/mm/Kconfig index ca3f146bc7053a..de643ce40c1108 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -631,7 +631,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || ZEN_INTERACTIVE default 1 # From a06be6e2d420830e71cf5067117edc433e6210bd Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 28 Mar 2020 13:06:28 -0700 Subject: [PATCH 17/26] ZEN: INTERACTIVE: mm: Disable watermark boosting by default What watermark boosting does is preemptively fire up kswapd to free memory when there hasn't been an allocation failure. It does this by increasing kswapd's high watermark goal and then firing up kswapd. The reason why this causes freezes is because, with the increased high watermark goal, kswapd will steal memory from processes that need it in order to make forward progress. These processes will, in turn, try to allocate memory again, which will cause kswapd to steal necessary pages from those processes again, in a positive feedback loop known as page thrashing. When page thrashing occurs, your system is essentially livelocked until the necessary forward progress can be made to stop processes from trying to continuously allocate memory and trigger kswapd to steal it back. This problem already occurs with kswapd *without* watermark boosting, but it's usually only encountered on machines with a small amount of memory and/or a slow CPU. Watermark boosting just makes the existing problem worse enough to notice on higher spec'd machines. Disable watermark boosting by default since it's a total dumpster fire. I can't imagine why anyone would want to explicitly enable it, but the option is there in case someone does. Signed-off-by: Sultan Alsawaf --- init/Kconfig | 1 + mm/page_alloc.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index ce7a58f97abf14..8db0e659f2bee6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -202,6 +202,7 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes Compact unevictable............: yes -> no + Watermark boost factor.........: 1.5 -> 0 --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 677f9bb0428b71..9407f8a22b06f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -276,7 +276,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_ZEN_INTERACTIVE +static int watermark_boost_factor __read_mostly; +#else static int watermark_boost_factor __read_mostly = 15000; +#endif static int watermark_scale_factor = 10; int defrag_mode; From 11b8bf5d968ea310d53a8381dd8f01b20448d465 Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Mon, 5 Sep 2022 11:35:20 -0500 Subject: [PATCH 18/26] ZEN: INTERACTIVE: mm/swap: Disable swap-in readahead Per an [issue][1] on the chromium project, swap-in readahead causes more jank than not. 
This might be caused by poor optimization on the swapping code, or the fact under memory pressure, we're pulling in pages we don't need, causing more swapping. Either way, this is mainline/upstream to Chromium, and ChromeOS developers care a lot about system responsiveness. Lets implement the same change so Zen Kernel users benefit. [1]: https://bugs.chromium.org/p/chromium/issues/detail?id=263561 Signed-off-by: Kai Krakow --- init/Kconfig | 1 + mm/swap.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 8db0e659f2bee6..3b696d2d7e2b47 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -203,6 +203,7 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes Compact unevictable............: yes -> no Watermark boost factor.........: 1.5 -> 0 + Swap-in readahead..............: 3 -> 0 --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/swap.c b/mm/swap.c index 2260dcd2775e75..3a7aefc524e56a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1101,6 +1101,10 @@ static const struct ctl_table swap_sysctl_table[] = { */ void __init swap_setup(void) { +#ifdef CONFIG_ZEN_INTERACTIVE + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ @@ -1112,6 +1116,7 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif register_sysctl_init("vm", swap_sysctl_table); } From fbb84338bf74b42095ef9671c86a12c3e9b75c6a Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Fri, 28 Nov 2025 16:50:14 +0100 Subject: [PATCH 19/26] BORE scheduler 6.18 from CachyOS Partially overrides ZEN interactive adjustments if enabled. 
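Before the large diff, a quick sketch of the mechanism: each task accumulates burst_time while it runs; calc_burst_penalty() in kernel/sched/bore.c (added below) converts that into a fixed-point log2 penalty above sched_burst_penalty_offset, scales it by sched_burst_penalty_scale, and the top byte of the resulting penalty value becomes bore.score, which effective_prio_bore() adds on top of the task's static priority. The userspace model below is an approximation only: it uses this patch's default tunables, ignores the smoothing, burst-cache and inheritance paths, and assumes the little-endian layout of the score byte.

/*
 * Rough userspace model of BORE's calc_burst_penalty() with the
 * defaults from this patch; not part of the kernel change.
 */
#include <stdint.h>
#include <stdio.h>

#define PENALTY_OFFSET	24		/* sched_burst_penalty_offset */
#define PENALTY_SCALE	1536		/* sched_burst_penalty_scale  */
#define MAX_PENALTY	((40u << 8) - 1)

/* Same fixed-point log2 as the kernel helper; 64 - clzll(v) == fls64(v). */
static uint32_t log2p1_u64_u32fp(uint64_t v, uint8_t fp)
{
	if (!v)
		return 0;
	uint32_t exponent = 64 - __builtin_clzll(v);
	uint32_t mantissa = (uint32_t)(v << (64 - exponent) << 1 >> (64 - fp));
	return exponent << fp | mantissa;
}

static uint32_t calc_burst_penalty(uint64_t burst_time)
{
	uint32_t greed = log2p1_u64_u32fp(burst_time, 8);
	uint32_t tolerance = PENALTY_OFFSET << 8;
	int32_t penalty = (int32_t)(greed - tolerance);
	uint32_t scaled = (penalty < 0 ? 0 : penalty) * PENALTY_SCALE >> 10;

	return scaled < MAX_PENALTY ? scaled : MAX_PENALTY;
}

int main(void)
{
	uint64_t ns[] = { 1000000ULL, 10000000ULL, 100000000ULL, 1000000000ULL };

	for (int i = 0; i < 4; i++)
		printf("burst %llu ns -> roughly +%u priority levels\n",
		       (unsigned long long)ns[i], calc_burst_penalty(ns[i]) >> 8);
	return 0;
}

With the defaults it prints +0 for 1 ms and 10 ms bursts, roughly +5 for 100 ms and roughly +10 for a full second, capped at +39; short, frequently sleeping tasks keep their priority while long CPU hogs are pushed down.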
Signed-off-by: Piotr Gorski Link: https://aur.archlinux.org/packages/linux-cachyos-bore --- include/linux/sched.h | 29 +++ include/linux/sched/bore.h | 39 ++++ init/Kconfig | 17 ++ kernel/Kconfig.hz | 17 ++ kernel/fork.c | 8 + kernel/futex/waitwake.c | 11 ++ kernel/sched/Makefile | 1 + kernel/sched/bore.c | 387 +++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 12 ++ kernel/sched/debug.c | 61 ++++++ kernel/sched/fair.c | 136 +++++++++++-- kernel/sched/features.h | 3 + kernel/sched/sched.h | 9 + 13 files changed, 712 insertions(+), 18 deletions(-) create mode 100644 include/linux/sched/bore.h create mode 100644 kernel/sched/bore.c diff --git a/include/linux/sched.h b/include/linux/sched.h index b469878de25c8a..5809513cf54b7f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -816,6 +816,32 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_SCHED_BORE +#define BORE_BC_TIMESTAMP_SHIFT 16 + +struct bore_bc { + u64 timestamp: 48; + u64 penalty: 16; +}; + +struct bore_ctx { + struct bore_bc subtree; + struct bore_bc group; + u64 burst_time; + u16 prev_penalty; + u16 curr_penalty; + union { + u16 penalty; + struct { + u8 _; + u8 score; + }; + }; + bool stop_update; + bool futex_waiting; +}; +#endif /* CONFIG_SCHED_BORE */ + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -874,6 +900,9 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif +#ifdef CONFIG_SCHED_BORE + struct bore_ctx bore; +#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h new file mode 100644 index 00000000000000..79dd72b9aa38ff --- /dev/null +++ b/include/linux/sched/bore.h @@ -0,0 +1,39 @@ +#ifndef _KERNEL_SCHED_BORE_H +#define _KERNEL_SCHED_BORE_H + +#include +#include +#include +#include +#include + +#define SCHED_BORE_AUTHOR "Masahito Suzuki" +#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" + +#define SCHED_BORE_VERSION "6.5.9" + +extern u8 __read_mostly sched_bore; +extern u8 __read_mostly sched_burst_inherit_type; +extern u8 __read_mostly sched_burst_smoothness; +extern u8 __read_mostly sched_burst_penalty_offset; +extern uint __read_mostly sched_burst_penalty_scale; +extern uint __read_mostly sched_burst_cache_lifetime; + +extern u8 effective_prio_bore(struct task_struct *p); +extern void update_curr_bore(struct task_struct *p, u64 delta_exec); +extern void restart_burst_bore(struct task_struct *p); +extern void restart_burst_rescale_deadline_bore(struct task_struct *p); +extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, + u64 clone_flags, u64 now); +extern void sched_init_bore(void); +extern void reset_task_bore(struct task_struct *p); + +extern int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +extern void reweight_entity( + struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); + +#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/init/Kconfig b/init/Kconfig index 3b696d2d7e2b47..f93c898c20f886 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1454,6 +1454,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. 
+config SCHED_BORE + bool "Burst-Oriented Response Enhancer" + default y + help + In Desktop and Mobile computing, one might prefer interactive + tasks to keep responsive no matter what they run in the background. + + Enabling this kernel feature modifies the scheduler to discriminate + tasks by their burst time (runtime since it last went sleeping or + yielding state) and prioritize those that run less bursty. + Such tasks usually include window compositor, widgets backend, + terminal emulator, video playback, games and so on. + With a little impact to scheduling fairness, it may improve + responsiveness especially under heavy background workload. + + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index ce1435cb08b1ec..9eee2005e25f07 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -57,3 +57,20 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS + +config MIN_BASE_SLICE_NS + int "Default value for min_base_slice_ns" + default 2000000 + help + The BORE Scheduler automatically calculates the optimal base + slice for the configured HZ using the following equation: + + base_slice_ns = + 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) + + This option sets the default lower bound limit of the base slice + to prevent the loss of task throughput due to overscheduling. + + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a95e..a166224b5f8638 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -116,6 +116,10 @@ /* For dup_mmap(). */ #include "../mm/internal.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + #include #define CREATE_TRACE_POINTS @@ -2325,6 +2329,10 @@ __latent_entropy struct task_struct *copy_process( * Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); +#ifdef CONFIG_SCHED_BORE + if (likely(p->pid)) + task_fork_bore(p, current, clone_flags, p->start_time); +#endif /* CONFIG_SCHED_BORE */ /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec27a..6484ad583f3bf7 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,6 +4,9 @@ #include #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ #include "futex.h" @@ -355,7 +358,15 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) * is no timeout, or if it has yet to expire. 
*/ if (!timeout || timeout->task) +#ifdef CONFIG_SCHED_BORE + { + current->bore.futex_waiting = true; +#endif /* CONFIG_SCHED_BORE */ schedule(); +#ifdef CONFIG_SCHED_BORE + current->bore.futex_waiting = false; + } +#endif /* CONFIG_SCHED_BORE */ } __set_current_state(TASK_RUNNING); } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae86371ddcddf..b688084bcecc75 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c new file mode 100644 index 00000000000000..7c1baa1bdf6c56 --- /dev/null +++ b/kernel/sched/bore.c @@ -0,0 +1,387 @@ +/* + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler + * Copyright (C) 2021-2025 Masahito Suzuki + */ +#include +#include +#include +#include "sched.h" + +#ifdef CONFIG_SCHED_BORE +u8 __read_mostly sched_bore = 1; +u8 __read_mostly sched_burst_inherit_type = 2; +u8 __read_mostly sched_burst_smoothness = 1; +u8 __read_mostly sched_burst_penalty_offset = 24; +uint __read_mostly sched_burst_penalty_scale = 1536; +uint __read_mostly sched_burst_cache_lifetime = 75000000; +static int __maybe_unused maxval_prio = 39; +static int __maybe_unused maxval_6_bits = 63; +static int __maybe_unused maxval_8_bits = 255; +static int __maybe_unused maxval_12_bits = 4095; + +#define MAX_BURST_PENALTY ((40U << 8) - 1) +#define BURST_CACHE_STOP_COUNT 63 + +static u32 (*inherit_penalty_fn)(struct task_struct *, u64, u64); + +static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { + if (!v) return 0; + u32 exponent = fls64(v), + mantissa = (u32)(v << (64 - exponent) << 1 >> (64 - fp)); + return exponent << fp | mantissa; +} + +static inline u32 calc_burst_penalty(u64 burst_time) { + u32 greed = log2p1_u64_u32fp(burst_time, 8), + tolerance = sched_burst_penalty_offset << 8, + penalty = max(0, (s32)(greed - tolerance)), + scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + return min(MAX_BURST_PENALTY, scaled_penalty); +} + +static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { + u64 unscaled, rescaled; + unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); + rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); + return rescaled; +} + +static inline u32 binary_smooth(u32 new, u32 old) { + if (new <= old) return new; + + u32 increment = new - old, + shift = sched_burst_smoothness, + divisor = 1U << shift; + + return old + ((increment + divisor - 1) >> shift); +} + +static void reweight_task_by_prio(struct task_struct *p, int prio) { + if (task_has_idle_policy(p)) return; + + struct sched_entity *se = &p->se; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + if (se->on_rq) { + p->bore.stop_update = true; + reweight_entity(cfs_rq_of(se), se, weight); + p->bore.stop_update = false; + } else + se->load.weight = weight; + se->load.inv_weight = sched_prio_to_wmult[prio]; +} + +u8 effective_prio_bore(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + if (likely(sched_bore)) + prio += p->bore.score; + return (u8)clamp(prio, 0, maxval_prio); +} + +static void update_penalty(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + + u8 prev_prio = effective_prio_bore(p); + + ctx->penalty = (p->flags & PF_KTHREAD)? 
0: + max(ctx->curr_penalty, ctx->prev_penalty); + + u8 new_prio = effective_prio_bore(p); + if (new_prio != prev_prio) + reweight_task_by_prio(p, new_prio); +} + +void update_curr_bore(struct task_struct *p, u64 delta_exec) { + struct bore_ctx *ctx = &p->bore; + if (ctx->stop_update) return; + + ctx->burst_time += delta_exec; + u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); + + if (curr_penalty <= ctx->prev_penalty) return; + update_penalty(p); +} + +void restart_burst_bore(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); + ctx->prev_penalty = new_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + update_penalty(p); +} + +void restart_burst_rescale_deadline_bore(struct task_struct *p) { + struct sched_entity *se = &p->se; + s64 vscaled, vremain = se->deadline - se->vruntime; + + u8 old_prio = effective_prio_bore(p); + restart_burst_bore(p); + u8 new_prio = effective_prio_bore(p); + + if (old_prio > new_prio) { + vscaled = rescale_slice(abs(vremain), old_prio, new_prio); + if (unlikely(vremain < 0)) + vscaled = -vscaled; + se->deadline = se->vruntime + vscaled; + } +} + +static inline bool task_is_bore_eligible(struct task_struct *p) +{return p && p->sched_class == &fair_sched_class && !p->exit_state;} + +#ifndef for_each_child_task +#define for_each_child_task(p, t) \ + list_for_each_entry(t, &(p)->children, sibling) +#endif + +static inline u32 count_children_upto2(struct task_struct *p) { + struct list_head *head = &p->children; + struct list_head *next = head->next; + return (next != head) + (next->next != head); +} + +static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { + u64 timestamp = bc->timestamp << BORE_BC_TIMESTAMP_SHIFT; + return now - timestamp > sched_burst_cache_lifetime; +} + +static void update_burst_cache(struct bore_bc *bc, + struct task_struct *p, u32 count, u32 total, u64 now) { + u32 average = count ? 
total / count : 0; + bc->penalty = max(average, p->bore.penalty); + bc->timestamp = now >> BORE_BC_TIMESTAMP_SHIFT; +} + +static u32 inherit_none(struct task_struct *parent, + u64 clone_flags, u64 now) +{ return 0; } + +static u32 inherit_from_parent(struct task_struct *parent, + u64 clone_flags, u64 now) { + if (clone_flags & CLONE_PARENT) + parent = parent->real_parent; + + struct bore_bc *bc = &parent->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *child; + u32 count = 0, total = 0; + for_each_child_task(parent, child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(child)) continue; + count++; + total += child->bore.penalty; + } + + update_burst_cache(bc, parent, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_ancestor_hub(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct task_struct *ancestor = parent; + u32 sole_child_count = 0; + + if (clone_flags & CLONE_PARENT) { + ancestor = ancestor->real_parent; + sole_child_count = 1; + } + + for (struct task_struct *next; + (next = ancestor->real_parent) != ancestor && + count_children_upto2(ancestor) <= sole_child_count; + ancestor = next, sole_child_count = 1) {} + + struct bore_bc *bc = &ancestor->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *direct_child; + u32 count = 0, total = 0; + for_each_child_task(ancestor, direct_child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + struct task_struct *descendant = direct_child; + while (count_children_upto2(descendant) == 1) + descendant = list_first_entry(&descendant->children, + struct task_struct, sibling); + + if (!task_is_bore_eligible(descendant)) continue; + count++; + total += descendant->bore.penalty; + } + + update_burst_cache(bc, ancestor, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { + struct task_struct *leader = p->group_leader; + struct bore_bc *bc = &leader->bore.group; + + if (burst_cache_expired(bc, now)) { + struct task_struct *sibling; + u32 count = 0, total = 0; + + for_each_thread(leader, sibling) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(sibling)) continue; + count++; + total += sibling->bore.penalty; + } + + update_burst_cache(bc, leader, count, total, now); + } + + return bc->penalty; +} + +void task_fork_bore(struct task_struct *p, + struct task_struct *parent, u64 clone_flags, u64 now) { + if (!task_is_bore_eligible(p) || unlikely(!sched_bore)) return; + + struct bore_ctx *ctx = &p->bore; + u32 inherited_penalty = (clone_flags & CLONE_THREAD)? 
+ inherit_from_thread_group(parent, now): + inherit_penalty_fn(parent, clone_flags, now); + + if (ctx->prev_penalty < inherited_penalty) + ctx->prev_penalty = inherited_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + ctx->stop_update = false; + ctx->futex_waiting = false; + update_penalty(p); +} + +void reset_task_bore(struct task_struct *p) +{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } + +static void update_inherit_type(void) { + switch(sched_burst_inherit_type) { + case 1: + inherit_penalty_fn = inherit_from_parent; + break; + case 2: + inherit_penalty_fn = inherit_from_ancestor_hub; + break; + default: + inherit_penalty_fn = inherit_none; + } +} + +void __init sched_init_bore(void) { + printk(KERN_INFO "%s %s by %s\n", + SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); + + reset_task_bore(&init_task); + update_inherit_type(); +} + +static void readjust_all_task_weights(void) { + struct task_struct *task; + struct rq *rq; + struct rq_flags rf; + + scoped_guard(write_lock_irq, &tasklist_lock) + for_each_process(task) { + if (!task_is_bore_eligible(task)) continue; + rq = task_rq_lock(task, &rf); + update_rq_clock(rq); + reweight_task_by_prio(task, effective_prio_bore(task)); + task_rq_unlock(rq, task, &rf); + } +} + +int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_inherit_type(); + + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_bore_sysctls[] = { + { + .procname = "sched_bore", + .data = &sched_bore, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_bore_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_inherit_type", + .data = &sched_burst_inherit_type, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_inherit_type_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_burst_penalty_offset", + .data = &sched_burst_penalty_offset, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_6_bits, + }, + { + .procname = "sched_burst_penalty_scale", + .data = &sched_burst_penalty_scale, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_12_bits, + }, + { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static int __init sched_bore_sysctl_init(void) { + register_sysctl_init("kernel", sched_bore_sysctls); + return 0; +} +late_initcall(sched_bore_sysctl_init); + +#endif // CONFIG_SYSCTL +#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f754a60de84849..c5a6f727fef143 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -100,6 
+100,10 @@ #include "../smpboot.h" #include "../locking/mutex.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1431,7 +1435,11 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { +#ifdef CONFIG_SCHED_BORE + int prio = effective_prio_bore(p); +#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; +#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8655,6 +8663,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); +#endif /* CONFIG_SCHED_BORE */ + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a7901e..751df396d94ba6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_BORE +#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ +static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ +{ \ + char buf[16]; \ + unsigned int value; \ +\ + if (cnt > 15) \ + cnt = 15; \ +\ + if (copy_from_user(&buf, ubuf, cnt)) \ + return -EFAULT; \ + buf[cnt] = '\0'; \ +\ + if (kstrtouint(buf, 10, &value)) \ + return -EINVAL; \ +\ + sysctl_sched_##name = value; \ + sched_update_##update_func(); \ +\ + *ppos += cnt; \ + return cnt; \ +} \ +\ +static int sched_##name##_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", sysctl_sched_##name); \ + return 0; \ +} \ +\ +static int sched_##name##_open(struct inode *inode, struct file *filp) \ +{ \ + return single_open(filp, sched_##name##_show, NULL); \ +} \ +\ +static const struct file_operations sched_##name##_fops = { \ + .open = sched_##name##_open, \ + .write = sched_##name##_write, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + +#undef DEFINE_SYSCTL_SCHED_FUNC +#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,6 +261,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -500,12 +548,19 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifdef CONFIG_SCHED_BORE + debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); + debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); +#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, 
&sysctl_sched_nr_migrate); @@ -747,6 +802,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE + SEQ_printf(m, " %2d", p->bore.score); +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1224,6 +1282,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); +#ifdef CONFIG_SCHED_BORE + P(bore.score); +#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a31912c241cf10..f2dd16ce5e73d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,6 +58,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * The initial- and re-scaling of tunables is configurable * @@ -67,28 +71,41 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant + * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * + * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -#ifdef CONFIG_ZEN_INTERACTIVE +#ifdef CONFIG_SCHED_BORE + +static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; +unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; + +#elifdef CONFIG_ZEN_INTERACTIVE + unsigned int sysctl_sched_base_slice = 400000ULL; static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; +__read_mostly unsigned int sysctl_sched_migration_cost = 300000UL; + #else + unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -#endif - -#ifdef CONFIG_ZEN_INTERACTIVE -__read_mostly unsigned int sysctl_sched_migration_cost = 300000UL; -#else __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; -#endif + +#endif /* CONFIG_SCHED_BORE */ static int __init setup_sched_thermal_decay_shift(char *str) { @@ -202,6 +219,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE +static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} +void sched_update_min_base_slice(void) { update_sysctl(); } +#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -232,6 +256,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } +#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -711,6 +736,9 @@ static 
void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) vlag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE + limit >>= !!sched_bore; +#endif /* CONFIG_SCHED_BORE */ se->vlag = clamp(vlag, -limit, limit); } @@ -904,10 +932,17 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) */ static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#ifdef CONFIG_SCHED_BORE + u64 slice = sysctl_sched_base_slice; + bool run_to_parity = likely(sched_bore) ? + sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); +#else /* CONFIG_SCHED_BORE */ u64 slice = normalized_sysctl_sched_base_slice; + bool run_to_parity = sched_feat(RUN_TO_PARITY); +#endif /* CONFIG_SCHED_BORE */ u64 vprot = se->deadline; - if (sched_feat(RUN_TO_PARITY)) + if (run_to_parity) slice = cfs_rq_min_slice(cfs_rq); slice = min(slice, se->slice); @@ -972,6 +1007,11 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) curr = NULL; if (curr && protect && protect_slice(curr)) +#ifdef CONFIG_SCHED_BORE + if (!entity_is_task(curr) || + !task_of(curr)->bore.futex_waiting || + unlikely(!sched_bore)) +#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1033,6 +1073,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ +#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1044,6 +1085,7 @@ int sched_update_scaling(void) return 0; } +#endif /* CONFIG_SCHED_BORE */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1242,6 +1284,11 @@ static void update_curr(struct cfs_rq *cfs_rq) update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { +#ifdef CONFIG_SCHED_BORE + struct task_struct *p = task_of(curr); + update_curr_bore(p, delta_exec); +#endif /* CONFIG_SCHED_BORE */ + /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -3780,7 +3827,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; @@ -5137,12 +5184,11 @@ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice, vruntime = avg_vruntime(cfs_rq); + u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5227,7 +5273,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } - +#ifdef CONFIG_SCHED_BORE + if (entity_is_task(se) && + likely(sched_bore) && + task_of(se)->bore.futex_waiting) + goto vslice_found; +#endif /* !CONFIG_SCHED_BORE */ + vslice = calc_delta_fair(se->slice, se); +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); + else +#endif /* CONFIG_SCHED_BORE */ /* * When joining the 
competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5236,6 +5293,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; +#ifdef CONFIG_SCHED_BORE +vslice_found: +#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5246,7 +5306,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void -requeue_delayed_entity(struct sched_entity *se); +requeue_delayed_entity(struct sched_entity *se, int flags); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5404,6 +5464,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); +#ifdef CONFIG_SCHED_BORE + if (sched_feat(DELAY_ZERO) && likely(sched_bore)) + update_entity_lag(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -6892,7 +6956,7 @@ static int sched_idle_cpu(int cpu) } static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -6905,13 +6969,22 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + flags |= ENQUEUE_WAKEUP; + else { +#endif /* CONFIG_SCHED_BORE */ + flags = 0; update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + } +#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, 0); + place_entity(cfs_rq, se, flags); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; @@ -6951,7 +7024,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); return; } @@ -6969,7 +7042,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); break; } cfs_rq = cfs_rq_of(se); @@ -7182,6 +7255,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); +#ifdef CONFIG_SCHED_BORE + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se = &p->se; + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + if (cfs_rq->curr == se) + update_curr(cfs_rq_of(&p->se)); + restart_burst_bore(p); + } +#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -8830,7 +8912,13 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; +#ifdef CONFIG_SCHED_BORE + bool run_to_parity = likely(sched_bore) ? + sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); + if (run_to_parity && do_preempt_short) +#else /* CONFIG_SCHED_BORE */ if (sched_feat(RUN_TO_PARITY) && do_preempt_short) +#endif /* CONFIG_SCHED_BORE */ update_protect_slice(cfs_rq, se); return; @@ -9010,16 +9098,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? 
*/ +#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_BORE + restart_burst_rescale_deadline_bore(curr); + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13269,6 +13366,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); +#ifdef CONFIG_SCHED_BORE + reset_task_bore(p); +#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331d6..abadc5ca74e2dd 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -18,6 +18,9 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) * 0-lag point or until is has exhausted it's slice. */ SCHED_FEAT(RUN_TO_PARITY, true) +#ifdef CONFIG_SCHED_BORE +SCHED_FEAT(RUN_TO_PARITY_BORE, false) +#endif /* CONFIG_SCHED_BORE */ /* * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for * current. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a5addab8f3f223..39732d2daf80ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2130,7 +2130,11 @@ extern int group_balance_cpu(struct sched_group *sg); extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); +#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); +#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -2809,7 +2813,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern __read_mostly unsigned int sysctl_sched_nr_migrate; extern __read_mostly unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_BORE +extern unsigned int sysctl_sched_min_base_slice; +extern __read_mostly uint sysctl_sched_base_slice; +#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; From ebe05f49165ccfe2f626bdfe8456fe238cf14277 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 11 Apr 2025 09:39:12 +0200 Subject: [PATCH 20/26] sched/fair: Prefer full-idle SMT cores When selecting an idle CPU for a task, always try to prioritize full-idle SMT cores (CPUs belonging to an SMT core where all its sibling are idle) over partially-idle cores. 
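The test implementing this is included in the diff below; in condensed form (illustration only, mirroring the is_idle_core()/is_idle_cpu() helpers added by this patch) a core qualifies only when every SMT sibling of the candidate CPU is idle, degrading to a plain per-CPU idle check when SMT is not active:

    /* Sketch of the full-idle-core test used by the fast paths. */
    static bool core_fully_idle(int cpu)
    {
            int sibling;

            if (!sched_smt_active())        /* no SMT: the core is just this CPU */
                    return available_idle_cpu(cpu) || sched_idle_cpu(cpu);

            for_each_cpu(sibling, cpu_smt_mask(cpu))
                    if (!available_idle_cpu(sibling) && !sched_idle_cpu(sibling))
                            return false;   /* a busy sibling disqualifies the core */

            return true;
    }

select_idle_sibling() then takes its target/prev/recently-used fast paths only when this stricter test passes, otherwise falling through to the normal idle-CPU scan.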
Signed-off-by: Andrea Righi --- kernel/sched/fair.c | 69 ++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f2dd16ce5e73d0..66526308dcaf98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7631,9 +7631,14 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas return new_cpu; } +static inline bool is_idle_cpu(int cpu) +{ + return available_idle_cpu(cpu) || sched_idle_cpu(cpu); +} + static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + if (is_idle_cpu(cpu) && sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; @@ -7644,6 +7649,24 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p) DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); +/* + * Return true if all the CPUs in the SMT core where @cpu belongs are idle, + * false otherwise. + */ +static bool is_idle_core(int cpu) +{ + int sibling; + + if (!sched_smt_active()) + return is_idle_cpu(cpu); + + for_each_cpu(sibling, cpu_smt_mask(cpu)) + if (!is_idle_cpu(sibling)) + return false; + + return true; +} + static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; @@ -7726,29 +7749,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { - if (cpu == target) - continue; - /* - * Check if the CPU is in the LLC scheduling domain of @target. - * Due to isolcpus, there is no guarantee that all the siblings are in the domain. 
- */ - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } - - return -1; -} - #else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) @@ -7765,9 +7765,9 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline bool is_idle_core(int cpu) { - return -1; + return is_idle_cpu(cpu); } #endif /* !CONFIG_SCHED_SMT */ @@ -7864,7 +7864,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + if (!is_idle_cpu(cpu)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); @@ -7935,7 +7935,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + if (is_idle_core(target) && asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -7943,7 +7943,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + is_idle_core(prev) && asym_fits_cpu(task_util, util_min, util_max, prev)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ -7975,7 +7975,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + is_idle_core(recent_used_cpu) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { @@ -8011,16 +8011,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - if (sched_smt_active()) { + if (sched_smt_active()) has_idle_core = test_idle_cores(target); - if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); - if ((unsigned int)i < nr_cpumask_bits) - return i; - } - } - i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; From 61fe0eaa6077c5b0c64fdeb84df3a3884469d029 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 7 Dec 2016 21:13:16 +1100 Subject: [PATCH 21/26] Make threaded IRQs optionally the default which can be disabled. 
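With CONFIG_FORCE_IRQ_THREADING=y, the force_irqthreads_key static key now defaults to true, so IRQ handlers that support threading are threaded without passing "threadirqs". The patch also adds an opt-out boot parameter; for example (illustrative kernel command line, exact bootloader syntax varies):

    ... quiet splash nothreadirqs

which runs the new early_param() hook, disables the static key again, and restores the stock on-demand behaviour.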
Signed-off-by: Kai Krakow --- include/linux/interrupt.h | 3 +++ kernel/irq/Kconfig | 17 +++++++++++++++++ kernel/irq/manage.c | 11 +++++++++++ 3 files changed, 31 insertions(+) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 51b6484c049345..9fae896aae83f8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -508,6 +508,9 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) +# elif defined(CONFIG_FORCE_IRQ_THREADING) +DECLARE_STATIC_KEY_TRUE(force_irqthreads_key); +# define force_irqthreads() (static_branch_likely(&force_irqthreads_key)) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1b4254d19a73ec..40d19d6826afdd 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -109,6 +109,23 @@ config GENERIC_IRQ_STAT_SNAPSHOT config IRQ_FORCED_THREADING bool +config FORCE_IRQ_THREADING + bool "Make IRQ threading compulsory" + depends on IRQ_FORCED_THREADING + default n + help + + Make IRQ threading mandatory for any IRQ handlers that support it + instead of being optional and requiring the threadirqs kernel + parameter. Instead they can be optionally disabled with the + nothreadirqs kernel parameter. + + Enabling this may make some architectures not boot with runqueue + sharing and MuQSS. + + Enable if you are building for a desktop or low latency system, + otherwise say N. + config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ help diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 400856abf67219..56f4ccc02dac37 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -25,7 +25,18 @@ #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_FORCE_IRQ_THREADING +DEFINE_STATIC_KEY_TRUE(force_irqthreads_key); +#else DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); +#endif + +static int __init setup_noforced_irqthreads(char *arg) +{ + static_branch_disable(&force_irqthreads_key); + return 0; +} +early_param("nothreadirqs", setup_noforced_irqthreads); static int __init setup_forced_irqthreads(char *arg) { From c02e4bab504a2be1e6d59012444029cfaa5f6934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 25 Oct 2021 09:49:42 -0300 Subject: [PATCH 22/26] futex: Add entry point for FUTEX_WAIT_MULTIPLE (opcode 31) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an option to wait on multiple futexes using the old interface, that uses opcode 31 through futex() syscall. Do that by just translation the old interface to use the new code. This allows old and stable versions of Proton to still use fsync in new kernel releases. 
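For reference, a userspace caller on a patched kernel invokes the old interface roughly as follows. This is an illustrative sketch only: the wrapper name is made up, the struct is a local copy of the futex_wait_block layout added by this patch, and the timeout is relative (the syscall converts it against the monotonic clock):

    #include <stdint.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    #define FUTEX_WAIT_MULTIPLE 31          /* opcode added by this patch */

    struct futex_wait_block {               /* must match the uapi layout */
            uint32_t *uaddr;
            uint32_t val;
            uint32_t bitset;
    };

    /* Wait until any futex in blocks[] no longer holds its expected value. */
    static long futex_wait_multiple(struct futex_wait_block *blocks,
                                    unsigned int count,
                                    const struct timespec *rel_timeout)
    {
            return syscall(SYS_futex, blocks, FUTEX_WAIT_MULTIPLE,
                           count, rel_timeout, NULL, 0);
    }

Internally the array is converted into the futex_vector representation used by the futex_waitv() code, so the wait semantics follow the existing futex_wait_multiple() implementation.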
Signed-off-by: André Almeida --- include/uapi/linux/futex.h | 13 +++++++ kernel/futex/syscalls.c | 75 +++++++++++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 7e2744ec89336a..87126e49fc3009 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -22,6 +22,7 @@ #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_LOCK_PI2 13 +#define FUTEX_WAIT_MULTIPLE 31 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -100,6 +101,18 @@ struct futex_waitv { __u32 __reserved; }; +/** + * struct futex_wait_block - Block of futexes to be waited for + * @uaddr: User address of the futex + * @val: Futex value expected by userspace + * @bitset: Bitset for the optional bitmasked wakeup + */ +struct futex_wait_block { + __u32 __user *uaddr; + __u32 val; + __u32 bitset; +}; + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 880c9bf2f31504..fb3b617b9ed83d 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -166,6 +166,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_WAIT_MULTIPLE: return true; } return false; @@ -178,13 +179,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) return -EINVAL; *t = timespec64_to_ktime(*ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } +/** + * futex_read_wait_block - Read an array of futex_wait_block from userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function creates and allocate an array of futex_q (we zero it to + * initialize the fields) and then, for each futex_wait_block element from + * userspace, fill a futex_q element with proper values. + */ +inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count) +{ + unsigned int i; + struct futex_vector *futexv; + struct futex_wait_block fwb; + struct futex_wait_block __user *entry = + (struct futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_WAITV_MAX) + return ERR_PTR(-EINVAL); + + futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(futexv); + return ERR_PTR(-EFAULT); + } + + futexv[i].w.flags = FUTEX_32; + futexv[i].w.val = fwb.val; + futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr); + futexv[i].q = futex_q_init; + } + + return futexv; +} + +int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to); + +int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count) +{ + int ret; + struct futex_vector *vs; + struct hrtimer_sleeper *to = NULL, timeout; + + to = futex_setup_timer(abs_time, &timeout, 0, 0); + + vs = futex_read_wait_block(uaddr, count); + + if (IS_ERR(vs)) + return PTR_ERR(vs); + + ret = futex_wait_multiple(vs, count, abs_time ? 
to : NULL); + kfree(vs); + + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + + return ret; +} + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) @@ -204,6 +271,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } @@ -512,6 +582,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ From 20aa2cc967b3a71523cef87cacdd38e5c7c9ec77 Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Thu, 7 May 2020 14:05:31 +0300 Subject: [PATCH 23/26] mm: Support soft dirty flag read with reset. v2: ported from 6.1 to 6.6 v3: ported from 6.12 to 6.18 Signed-off-by: Kai Krakow --- fs/proc/base.c | 3 + fs/proc/internal.h | 1 + fs/proc/task_mmu.c | 140 +++++++++++++++++++++++++++++++++++++++------ 3 files changed, 128 insertions(+), 16 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 6299878e3d97e6..8558fb1c26c0a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3357,6 +3357,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), +#ifdef CONFIG_MEM_SOFT_DIRTY + REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations), +#endif #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d1598576506c1e..5898cd154fd252 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -406,6 +406,7 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_pagemap_reset_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc35a0543f0191..59bd938fe73ac9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1600,7 +1600,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return folio_maybe_dma_pinned(folio); } -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* @@ -1610,37 +1610,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. 
*/ pte_t ptent = ptep_get(pte); + bool ret = false; if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) - return; + return ret; old_pte = ptep_modify_prot_start(vma, addr, pte); + ret = pte_soft_dirty(old_pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { + ret = pte_swp_soft_dirty(ptent); ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } + return ret; } #else -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + return false; } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + bool ret = false; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); + + ret = pmd_soft_dirty(old); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) @@ -1651,14 +1660,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + ret = pmd_swp_soft_dirty(pmd); pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } + return ret; } #else -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { + return false; } #endif @@ -1838,6 +1850,7 @@ struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; + bool reset; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -1848,6 +1861,7 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_SOFT_DIRTY_PAGE BIT_ULL(57) #define PM_UFFD_WP BIT_ULL(57) #define PM_GUARD_REGION BIT_ULL(58) #define PM_FILE BIT_ULL(61) @@ -1869,6 +1883,14 @@ static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) return 0; } +static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm) +{ + ((unsigned long *)pm->buffer)[pm->pos++] = addr; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) { if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) @@ -1883,6 +1905,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, unsigned long addr = start; int err = 0; + if (pm->reset) + goto out; + while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); @@ -1993,6 +2018,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct page *page = NULL; struct folio *folio = NULL; + if (pm->reset) + { + if (clear_soft_dirty_pmd(vma, addr, pmdp)) + { + for (; addr != end; addr += PAGE_SIZE) + { + err = add_addr_to_pagemap(addr, pm); + if (err) + break; + } + } + goto trans_huge_done; + } + if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -2055,6 +2094,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, frame += (1 << MAX_SWAPFILES_SHIFT); } } +trans_huge_done: spin_unlock(ptl); 
return err; } @@ -2070,10 +2110,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { - pagemap_entry_t pme; + if (pm->reset) + { + if (clear_soft_dirty(vma, addr, pte)) + err = add_addr_to_pagemap(addr, pm); + } + else + { + pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(&pme, pm); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); + err = add_to_pagemap(&pme, pm); + } if (err) break; } @@ -2177,8 +2225,8 @@ static const struct mm_walk_ops pagemap_ops = { * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t do_pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, bool reset) { struct mm_struct *mm = file->private_data; struct pagemapread pm; @@ -2187,6 +2235,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; + struct mmu_notifier_range range; + size_t buffer_len; if (!mm || !mmget_not_zero(mm)) goto out; @@ -2202,19 +2252,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); + pm.reset = reset; - pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); + buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES); + + pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; - end_vaddr = mm->task_size; + + start_vaddr = svpfn << PAGE_SHIFT; + + if (reset) + { + if (count < sizeof(end_vaddr)) + { + ret = -EINVAL; + goto out_mm; + } + if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) + return -EFAULT; + end_vaddr = min(end_vaddr, mm->task_size); + } + else + { + end_vaddr = mm->task_size; + start_vaddr = end_vaddr; + } /* watch out for wraparound */ - start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; @@ -2239,18 +2308,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + pm.len = min(buffer_len, count / PM_ENTRY_BYTES); + + end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT)); /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; + + if (reset) + { + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, mm, start_vaddr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); + if (reset) + { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); + } mmap_read_unlock(mm); - start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); + BUG_ON(ret && ret != PM_END_OF_BUFFER); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; @@ -2258,6 +2344,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, copied += len; buf += len; count -= len; + + start_vaddr = reset && pm.pos == pm.len ? 
((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) @@ -2271,6 +2359,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, return ret; } +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, false); +} + +static ssize_t pagemap_reset_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, true); +} + static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; @@ -3065,6 +3165,14 @@ const struct file_operations proc_pagemap_operations = { .unlocked_ioctl = do_pagemap_cmd, .compat_ioctl = do_pagemap_cmd, }; + +const struct file_operations proc_pagemap_reset_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_reset_read, + .open = pagemap_open, + .release = pagemap_release, +}; + #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA From 324549f66d20cc96cd81761f63098a61051e8d34 Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Wed, 6 May 2020 14:37:44 +0300 Subject: [PATCH 24/26] mm: Support soft dirty flag reset for VA range. v2: ported from 6.1 to 6.6 Signed-off-by: Kai Krakow --- fs/proc/task_mmu.c | 128 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 102 insertions(+), 26 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 59bd938fe73ac9..8e0afdd680fcf3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1580,6 +1580,8 @@ enum clear_refs_types { struct clear_refs_private { enum clear_refs_types type; + unsigned long start, end; + bool clear_range; }; #ifdef CONFIG_MEM_SOFT_DIRTY @@ -1683,6 +1685,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct folio *folio; + BUG_ON(addr < cp->start || end > cp->end); + ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { @@ -1740,9 +1744,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; - if (vma->vm_flags & VM_PFNMAP) + if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP)) return 1; + BUG_ON(start < cp->start || end > cp->end); + /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. 
@@ -1766,10 +1772,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF] = {}; + char buffer[18] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + unsigned long start, end; + bool clear_range; int itype; int rv; @@ -1777,12 +1785,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &itype); - if (rv < 0) - return rv; - type = (enum clear_refs_types)itype; - if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) - return -EINVAL; + + if (buffer[0] == '6') + { + static int once; + + if (!once++) + printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n"); + + if (count != 17) + return -EINVAL; + + type = CLEAR_REFS_SOFT_DIRTY; + start = *(unsigned long *)(buffer + 1); + end = *(unsigned long *)(buffer + 1 + 8); + } + else + { + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) + return -EINVAL; + + start = 0; + end = -1UL; + } task = get_proc_task(file_inode(file)); if (!task) @@ -1795,40 +1825,86 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .type = type, }; - if (mmap_write_lock_killable(mm)) { - count = -EINTR; - goto out_mm; + if (start || end != -1UL) + { + start = min(start, -1UL) & PAGE_MASK; + end = min(end, -1UL) & PAGE_MASK; + + if (start >= end) + { + count = -EINVAL; + goto out_mm; + } + clear_range = true; } + else + { + clear_range = false; + } + + cp.start = start; + cp.end = end; + cp.clear_range = clear_range; + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); - goto out_unlock; + mmap_write_unlock(mm); + goto out_mm; } if (type == CLEAR_REFS_SOFT_DIRTY) { - for_each_vma(vmi, vma) { - if (!(vma->vm_flags & VM_SOFTDIRTY)) - continue; - vm_flags_clear(vma, VM_SOFTDIRTY); - vma_set_page_prot(vma); + if (mmap_read_lock_killable(mm)) { + count = -EINTR; + goto out_mm; } - + if (!clear_range) + for_each_vma(vmi, vma) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + for_each_vma(vmi, vma) { + vm_flags_clear(vma, VM_SOFTDIRTY); + vma_set_page_prot(vma); + } + mmap_write_downgrade(mm); + break; + } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, mm, 0, -1UL); + 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); + else + { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + } + walk_page_range(mm, start, end == -1UL ? 
-1 : end, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); + mmap_read_unlock(mm); + } + else + { + mmap_write_unlock(mm); } -out_unlock: - mmap_write_unlock(mm); out_mm: mmput(mm); } @@ -1954,13 +2030,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); @@ -2040,7 +2116,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) @@ -2061,7 +2137,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); From cf4fa33827fb048d696394f6b85fd64331138605 Mon Sep 17 00:00:00 2001 From: Masahito S Date: Tue, 16 Dec 2025 19:52:04 +0900 Subject: [PATCH 25/26] ADIOS: Backport linux6.19-rc1-ADIOS-3.1.7 ADIOS (Adaptive Deadline I/O Scheduler) is a block layer I/O scheduler for the Linux kernel, designed for modern multi-queue block devices (blk-mq). It aims to provide low latency for I/O operations by combining deadline scheduling principles with a learning-based adaptive latency control mechanism. Link: https://github.com/firelzrd/adios --- block/Kconfig.iosched | 14 + block/Makefile | 8 + block/adios.c | 2006 +++++++++++++++++++++++++++++++++++++++++ block/elevator.c | 9 +- 4 files changed, 2035 insertions(+), 2 deletions(-) create mode 100644 block/adios.c diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 27f11320b8d12d..e98585dd83e0f4 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -16,6 +16,20 @@ config MQ_IOSCHED_KYBER synchronous writes, it will self-tune queue depths to achieve that goal. +config MQ_IOSCHED_ADIOS + tristate "Adaptive Deadline I/O scheduler" + default m + help + The Adaptive Deadline I/O Scheduler (ADIOS) is a multi-queue I/O + scheduler with learning-based adaptive latency control. + +config MQ_IOSCHED_DEFAULT_ADIOS + bool "Enable ADIOS I/O scheduler as default MQ I/O scheduler" + depends on MQ_IOSCHED_ADIOS=y + default n + help + Enable the ADIOS I/O scheduler as the default scheduler for MQ I/O. 
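Once built (tristate, default m above), the scheduler is selected per device through the standard block-layer sysfs interface rather than through Kconfig alone. A minimal sketch, assuming the module is already loaded, that the elevator registers under the name "adios", and using sda purely as an example device:

/* Illustrative sketch (not part of the patch): activate ADIOS for one
 * device at runtime via the standard scheduler attribute.  "sda" is an
 * example device name; the elevator name "adios" is assumed. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/block/sda/queue/scheduler";
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror(attr);
		return 1;
	}
	if (write(fd, "adios", strlen("adios")) < 0)
		perror("write");
	close(fd);
	return 0;
}

Reading the same attribute lists all registered elevators with the active one in brackets, which is a quick way to confirm that the scheduler registered correctly.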
+ config IOSCHED_BFQ tristate "BFQ I/O scheduler" select BLK_ICQ diff --git a/block/Makefile b/block/Makefile index c65f4da937026f..105b12fd86b842 100644 --- a/block/Makefile +++ b/block/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o +obj-$(CONFIG_MQ_IOSCHED_ADIOS) += adios.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o @@ -36,3 +37,10 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o + +all: + make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules + +clean: + make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean + diff --git a/block/adios.c b/block/adios.c new file mode 100644 index 00000000000000..41de52785cd1ed --- /dev/null +++ b/block/adios.c @@ -0,0 +1,2006 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Adaptive Deadline I/O Scheduler (ADIOS) + * Copyright (C) 2025 Masahito Suzuki + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "elevator.h" +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" + +#define ADIOS_VERSION "3.1.7" + +/* Request Types: + * + * Tier 0 (Highest Priority): Emergency & System Integrity Requests + * ----------------------------------------------------------------- + * - Target: Requests with the BLK_MQ_INSERT_AT_HEAD flag. + * - Purpose: For critical, non-negotiable operations such as device error + * recovery or flush sequences that must bypass all other scheduling logic. + * - Implementation: Placed in a dedicated, high-priority FIFO queue + * (`prio_queue[0]`) for immediate dispatch. + * + * Tier 1 (High Priority): I/O Barrier Guarantees + * --------------------------------------------------------------- + * - Target: Requests with the REQ_OP_FLUSH flag. + * - Purpose: To enforce a strict I/O barrier. When a flush request is + * received, the scheduler stops processing new requests from its main + * queues until all preceding requests have been completed. This guarantees + * the order of operations required by filesystems for data integrity. + * - Implementation: A state flag (ADIOS_STATE_BARRIER) halts + * insertion into the main deadline tree. The barrier request and all + * subsequent requests are held in a temporary `barrier_queue`. Once the + * main queues are drained, the barrier request and the subsequent requests + * are released from the pending queue back into the scheduler. + * + * Tier 2 (Medium Priority): Application Responsiveness + * ---------------------------------------------------- + * - Target: Normal synchronous requests (e.g., from standard file reads). + * - Purpose: To ensure correct application behavior for operations that + * depend on sequential I/O completion (e.g., file system mounts) and to + * provide low latency for interactive applications. + * - Implementation: The deadline for these requests is set to their start + * time (`rq->start_time_ns`). This effectively enforces FIFO-like behavior + * within the deadline-sorted red-black tree, preventing out-of-order + * execution of dependent synchronous operations. 
+ * + * Tier 3 (Normal Priority): Background Throughput + * ----------------------------------------------- + * - Target: Asynchronous requests. + * - Purpose: To maximize disk throughput for background tasks where latency + * is not critical. + * - Implementation: These are the only requests where ADIOS's adaptive + * latency prediction model is used. A dynamic deadline is calculated based + * on the predicted I/O latency, allowing for aggressive reordering to + * optimize I/O efficiency. + * + * Dispatch Logic: + * The scheduler always dispatches requests in strict priority order: + * 1. prio_queue[0] (Tier 0) + * 2. The deadline-sorted batch queue (which naturally prioritizes Tier 2 + * over Tier 3 due to their calculated deadlines). + * 3. Barrier-pending requests are handled only after the main queues are empty. + */ + +// Global variable to control the latency +static u64 default_global_latency_window = 16000000ULL; +static u64 default_global_latency_window_rotational = 22000000ULL; +// Ratio below which batch queues should be refilled +static u8 default_bq_refill_below_ratio = 20; +// Maximum latency sample to input +static u64 default_lat_model_latency_limit = 500000000ULL; +// Batch ordering strategy +static u64 default_batch_order = 0; + +/* Compliance Flags: + * 0x1: Async requests will not be reordered based on the predicted latency + */ +enum adios_compliance_flags { + ADIOS_CF_FIXORDER = 1U << 0, +}; + +// Flags to control compliance with block layer constraints +static u64 default_compliance_flags = 0x0; + +// Dynamic thresholds for shrinkage +static u32 default_lm_shrink_at_kreqs = 5000; +static u32 default_lm_shrink_at_gbytes = 50; +static u32 default_lm_shrink_resist = 2; + +enum adios_optype { + ADIOS_READ = 0, + ADIOS_WRITE = 1, + ADIOS_DISCARD = 2, + ADIOS_OTHER = 3, + ADIOS_OPTYPES = 4, +}; + +// Latency targets for each operation type +static u64 default_latency_target[ADIOS_OPTYPES] = { + [ADIOS_READ] = 2ULL * NSEC_PER_MSEC, + [ADIOS_WRITE] = 2000ULL * NSEC_PER_MSEC, + [ADIOS_DISCARD] = 8000ULL * NSEC_PER_MSEC, + [ADIOS_OTHER] = 0ULL * NSEC_PER_MSEC, +}; + +// Maximum batch size limits for each operation type +static u32 default_batch_limit[ADIOS_OPTYPES] = { + [ADIOS_READ] = 36, + [ADIOS_WRITE] = 72, + [ADIOS_DISCARD] = 1, + [ADIOS_OTHER] = 1, +}; + +enum adios_batch_order { + ADIOS_BO_OPTYPE = 0, + ADIOS_BO_ELEVATOR = 1, +}; + +// Thresholds for latency model control +#define LM_BLOCK_SIZE_THRESHOLD 4096 +#define LM_SAMPLES_THRESHOLD 1024 +#define LM_INTERVAL_THRESHOLD 1500 +#define LM_OUTLIER_PERCENTILE 99 +#define LM_LAT_BUCKET_COUNT 64 + +#define ADIOS_PQ_LEVELS 2 +#define ADIOS_DL_TYPES 2 +#define ADIOS_BQ_PAGES 2 + +static u32 default_dl_prio[ADIOS_DL_TYPES] = {8, 0}; + +// Bit flags for the atomic state variable, indicating which queues have requests. 
+enum adios_state_flags { + ADIOS_STATE_PQ_0 = 1U << 0, + ADIOS_STATE_PQ_1 = 1U << 1, + ADIOS_STATE_DL_0 = 1U << 2, + ADIOS_STATE_DL_1 = 1U << 3, + ADIOS_STATE_BQ_PAGE_0 = 1U << 4, + ADIOS_STATE_BQ_PAGE_1 = 1U << 5, + ADIOS_STATE_BARRIER = 1U << 6, +}; +#define ADIOS_STATE_PQ 0 +#define ADIOS_STATE_DL 2 +#define ADIOS_STATE_BQ 4 +#define ADIOS_STATE_BP 6 + +// Temporal granularity of the deadline tree node (dl_group) +#define ADIOS_QUANTUM_SHIFT 20 + +#define ADIOS_MAX_INSERTS_PER_LOCK 72 +#define ADIOS_MAX_DELETES_PER_LOCK 24 + +// Structure to hold latency bucket data for small requests +struct latency_bucket_small { + u64 weighted_sum_latency; + u64 sum_of_weights; +}; + +// Structure to hold latency bucket data for large requests +struct latency_bucket_large { + u64 weighted_sum_latency; + u64 weighted_sum_block_size; + u64 sum_of_weights; +}; + +// Structure to hold per-cpu buckets, improving data locality and code clarity. +struct lm_buckets { + struct latency_bucket_small small_bucket[LM_LAT_BUCKET_COUNT]; + struct latency_bucket_large large_bucket[LM_LAT_BUCKET_COUNT]; +}; + +// Structure to hold RCU-protected latency model parameters +struct latency_model_params { + u64 base; + u64 slope; + u64 small_sum_delay; + u64 small_count; + u64 large_sum_delay; + u64 large_sum_bsize; + u64 last_update_jiffies; + struct rcu_head rcu; +}; + +// Structure to hold the latency model context data +struct latency_model { + spinlock_t update_lock; + struct latency_model_params __rcu *params; + + // Per-CPU buckets to avoid lock contention on the completion path + struct lm_buckets __percpu *pcpu_buckets; + + u32 lm_shrink_at_kreqs; + u32 lm_shrink_at_gbytes; + u8 lm_shrink_resist; +}; + +union adios_in_flight_rqs { + atomic64_t atomic; + u64 scalar; + struct { + u64 count: 16; + u64 total_pred_lat: 48; + }; +}; + +// Adios scheduler data +struct adios_data { + spinlock_t pq_lock; + struct list_head prio_queue[2]; + + struct rb_root_cached dl_tree[2]; + spinlock_t lock; + s64 dl_bias; + s32 dl_prio[2]; + + atomic_t state; + u8 bq_state[ADIOS_BQ_PAGES]; + + void (*insert_request_fn)(struct blk_mq_hw_ctx *, struct request *, + blk_insert_t, struct list_head *); + + u64 global_latency_window; + u64 compliance_flags; + u64 latency_target[ADIOS_OPTYPES]; + u32 batch_limit[ADIOS_OPTYPES]; + u32 batch_actual_max_size[ADIOS_OPTYPES]; + u32 batch_actual_max_total; + u32 async_depth; + u32 lat_model_latency_limit; + u8 bq_refill_below_ratio; + u8 is_rotational; + u8 batch_order; + u8 elv_direction; + sector_t head_pos; + sector_t last_completed_pos; + + bool bq_page; + struct list_head batch_queue[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; + u32 batch_count[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; + u8 bq_batch_order[ADIOS_BQ_PAGES]; + spinlock_t bq_lock; + spinlock_t barrier_lock; + struct list_head barrier_queue; + + struct lm_buckets *aggr_buckets; + + struct latency_model latency_model[ADIOS_OPTYPES]; + struct timer_list update_timer; + + union adios_in_flight_rqs in_flight_rqs; + atomic64_t total_pred_lat; + u64 last_completed_time; + + struct kmem_cache *rq_data_pool; + struct kmem_cache *dl_group_pool; + + struct request_queue *queue; +}; + +// List of requests with the same deadline in the deadline-sorted tree +struct dl_group { + struct rb_node node; + struct list_head rqs; + u64 deadline; +} __attribute__((aligned(64))); + +// Structure to hold scheduler-specific data for each request +struct adios_rq_data { + struct list_head *dl_group; + struct list_head dl_node; + + struct request *rq; + u64 deadline; + u64 
pred_lat; + u32 block_size; + bool managed; +} __attribute__((aligned(64))); + +static const int adios_prio_to_wmult[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +static inline bool compliant(struct adios_data *ad, u32 flag) { + return ad->compliance_flags & flag; +} + +// Count the number of entries in aggregated small buckets +static u64 lm_count_small_entries(struct latency_bucket_small *buckets) { + u64 total_weight = 0; + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) + total_weight += buckets[i].sum_of_weights; + return total_weight; +} + +// Update the small buckets in the latency model from aggregated data +static bool lm_update_small_buckets(struct latency_model *model, + struct latency_model_params *params, + struct latency_bucket_small *buckets, + u64 total_weight, bool count_all) { + u64 sum_latency = 0; + u64 sum_weight = 0; + u64 cumulative_weight = 0, threshold_weight = 0; + u8 outlier_threshold_bucket = 0; + u8 outlier_percentile = LM_OUTLIER_PERCENTILE; + u8 reduction; + + if (count_all) + outlier_percentile = 100; + + // Calculate the threshold weight for outlier detection + threshold_weight = (total_weight * outlier_percentile) / 100; + + // Identify the bucket that corresponds to the outlier threshold + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + cumulative_weight += buckets[i].sum_of_weights; + if (cumulative_weight >= threshold_weight) { + outlier_threshold_bucket = i; + break; + } + } + + // Calculate the average latency, excluding outliers + for (u8 i = 0; i <= outlier_threshold_bucket; i++) { + struct latency_bucket_small *bucket = &buckets[i]; + if (i < outlier_threshold_bucket) { + sum_latency += bucket->weighted_sum_latency; + sum_weight += bucket->sum_of_weights; + } else { + // The threshold bucket's contribution is proportional + u64 remaining_weight = + threshold_weight - (cumulative_weight - bucket->sum_of_weights); + if (bucket->sum_of_weights > 0) { + sum_latency += div_u64(bucket->weighted_sum_latency * + remaining_weight, bucket->sum_of_weights); + sum_weight += remaining_weight; + } + } + } + + // Shrink the model if it reaches at the readjustment threshold + if (params->small_count >= 1000ULL * model->lm_shrink_at_kreqs) { + reduction = model->lm_shrink_resist; + if (params->small_count >> reduction) { + params->small_sum_delay -= params->small_sum_delay >> reduction; + params->small_count -= params->small_count >> reduction; + } + } + + if (!sum_weight) + return false; + + // Accumulate the average latency into the statistics + params->small_sum_delay += sum_latency; + params->small_count += sum_weight; + + return true; +} + +// Count the number of entries in aggregated large buckets +static u64 lm_count_large_entries(struct latency_bucket_large *buckets) { + u64 total_weight = 0; + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) + total_weight += buckets[i].sum_of_weights; + return total_weight; +} + +// Update the large buckets in the latency model from aggregated data +static bool lm_update_large_buckets(struct latency_model *model, + struct latency_model_params *params, + struct latency_bucket_large *buckets, + u64 total_weight, bool count_all) { + s64 sum_latency = 0; + u64 sum_block_size = 0, intercept; + u64 cumulative_weight = 0, threshold_weight = 0; + u64 sum_weight 
= 0; + u8 outlier_threshold_bucket = 0; + u8 outlier_percentile = LM_OUTLIER_PERCENTILE; + u8 reduction; + + if (count_all) + outlier_percentile = 100; + + // Calculate the threshold weight for outlier detection + threshold_weight = (total_weight * outlier_percentile) / 100; + + // Identify the bucket that corresponds to the outlier threshold + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + cumulative_weight += buckets[i].sum_of_weights; + if (cumulative_weight >= threshold_weight) { + outlier_threshold_bucket = i; + break; + } + } + + // Calculate the average latency and block size, excluding outliers + for (u8 i = 0; i <= outlier_threshold_bucket; i++) { + struct latency_bucket_large *bucket = &buckets[i]; + if (i < outlier_threshold_bucket) { + sum_latency += bucket->weighted_sum_latency; + sum_block_size += bucket->weighted_sum_block_size; + sum_weight += bucket->sum_of_weights; + } else { + // The threshold bucket's contribution is proportional + u64 remaining_weight = + threshold_weight - (cumulative_weight - bucket->sum_of_weights); + if (bucket->sum_of_weights > 0) { + sum_latency += div_u64(bucket->weighted_sum_latency * + remaining_weight, bucket->sum_of_weights); + sum_block_size += div_u64(bucket->weighted_sum_block_size * + remaining_weight, bucket->sum_of_weights); + sum_weight += remaining_weight; + } + } + } + + if (!sum_weight) + return false; + + // Shrink the model if it reaches at the readjustment threshold + if (params->large_sum_bsize >= 0x40000000ULL * model->lm_shrink_at_gbytes) { + reduction = model->lm_shrink_resist; + if (params->large_sum_bsize >> reduction) { + params->large_sum_delay -= params->large_sum_delay >> reduction; + params->large_sum_bsize -= params->large_sum_bsize >> reduction; + } + } + + // Accumulate the average delay into the statistics + intercept = params->base; + if (sum_latency > intercept) + sum_latency -= intercept; + + params->large_sum_delay += sum_latency; + params->large_sum_bsize += sum_block_size; + + return true; +} + +static void reset_buckets(struct lm_buckets *buckets) +{ memset(buckets, 0, sizeof(*buckets)); } + +static void lm_reset_pcpu_buckets(struct latency_model *model) { + int cpu; + for_each_possible_cpu(cpu) + reset_buckets(per_cpu_ptr(model->pcpu_buckets, cpu)); +} + +// Update the latency model parameters and statistics +static void latency_model_update( + struct adios_data *ad, struct latency_model *model) { + u64 now; + u64 small_weight, large_weight; + bool time_elapsed; + bool small_processed = false, large_processed = false; + struct lm_buckets *aggr = ad->aggr_buckets; + struct latency_bucket_small *asb; + struct latency_bucket_large *alb; + struct lm_buckets *pcpu_b; + unsigned long flags; + int cpu; + struct latency_model_params *old_params, *new_params; + + spin_lock_irqsave(&model->update_lock, flags); + + old_params = rcu_dereference_protected(model->params, + lockdep_is_held(&model->update_lock)); + new_params = kmemdup(old_params, sizeof(*new_params), GFP_ATOMIC); + if (!new_params) { + spin_unlock_irqrestore(&model->update_lock, flags); + return; + } + + // Aggregate data from all CPUs and reset per-cpu buckets. 
+ for_each_possible_cpu(cpu) { + pcpu_b = per_cpu_ptr(model->pcpu_buckets, cpu); + + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + if (pcpu_b->small_bucket[i].sum_of_weights) { + asb = &aggr->small_bucket[i]; + asb->sum_of_weights += + pcpu_b->small_bucket[i].sum_of_weights; + asb->weighted_sum_latency += + pcpu_b->small_bucket[i].weighted_sum_latency; + } + if (pcpu_b->large_bucket[i].sum_of_weights) { + alb = &aggr->large_bucket[i]; + alb->sum_of_weights += + pcpu_b->large_bucket[i].sum_of_weights; + alb->weighted_sum_latency += + pcpu_b->large_bucket[i].weighted_sum_latency; + alb->weighted_sum_block_size += + pcpu_b->large_bucket[i].weighted_sum_block_size; + } + } + // Reset per-cpu buckets after aggregating + reset_buckets(pcpu_b); + } + + // Count the number of entries in aggregated buckets + small_weight = lm_count_small_entries(aggr->small_bucket); + large_weight = lm_count_large_entries(aggr->large_bucket); + + // Whether enough time has elapsed since the last update + now = jiffies; + time_elapsed = unlikely(!new_params->base) || + new_params->last_update_jiffies + + msecs_to_jiffies(LM_INTERVAL_THRESHOLD) <= now; + + // Update small buckets + if (small_weight && (time_elapsed || + LM_SAMPLES_THRESHOLD <= small_weight || !new_params->base)) { + small_processed = lm_update_small_buckets(model, new_params, + aggr->small_bucket, small_weight, !new_params->base); + memset(&aggr->small_bucket[0], 0, sizeof(aggr->small_bucket)); + } + // Update large buckets + if (large_weight && (time_elapsed || + LM_SAMPLES_THRESHOLD <= large_weight || !new_params->slope)) { + large_processed = lm_update_large_buckets(model, new_params, + aggr->large_bucket, large_weight, !new_params->slope); + memset(&aggr->large_bucket[0], 0, sizeof(aggr->large_bucket)); + } + + // Update the base parameter if small bucket was processed + if (small_processed && likely(new_params->small_count)) + new_params->base = div_u64(new_params->small_sum_delay, + new_params->small_count); + + // Update the slope parameter if large bucket was processed + if (large_processed && likely(new_params->large_sum_bsize)) + new_params->slope = div_u64(new_params->large_sum_delay, + DIV_ROUND_UP_ULL(new_params->large_sum_bsize, 1024)); + + // Update last updated jiffies if update happened or time has elapsed + if (small_processed || large_processed || time_elapsed) + new_params->last_update_jiffies = now; + + rcu_assign_pointer(model->params, new_params); + spin_unlock_irqrestore(&model->update_lock, flags); + + kfree_rcu(old_params, rcu); +} + +// Determine the bucket index for a given measured and predicted latency +static u8 lm_input_bucket_index(u64 measured, u64 predicted) { + u8 bucket_index; + + if (measured < predicted * 2) + bucket_index = div_u64((measured * 20), predicted); + else if (measured < predicted * 5) + bucket_index = div_u64((measured * 10), predicted) + 20; + else + bucket_index = div_u64((measured * 3), predicted) + 40; + + return bucket_index; +} + +// Input latency data into the latency model +static void latency_model_input(struct adios_data *ad, + struct latency_model *model, + u32 block_size, u64 latency, u64 pred_lat, u32 weight) { + unsigned long flags; + u8 bucket_index; + struct lm_buckets *buckets; + u64 current_base; + struct latency_model_params *params; + + local_irq_save(flags); + buckets = per_cpu_ptr(model->pcpu_buckets, __smp_processor_id()); + + rcu_read_lock(); + params = rcu_dereference(model->params); + current_base = params->base; + rcu_read_unlock(); + + if (block_size <= 
LM_BLOCK_SIZE_THRESHOLD) { + // Handle small requests + bucket_index = lm_input_bucket_index(latency, current_base ?: 1); + + if (bucket_index >= LM_LAT_BUCKET_COUNT) + bucket_index = LM_LAT_BUCKET_COUNT - 1; + + buckets->small_bucket[bucket_index].sum_of_weights += weight; + buckets->small_bucket[bucket_index].weighted_sum_latency += + latency * weight; + + local_irq_restore(flags); + + if (unlikely(!current_base)) { + latency_model_update(ad, model); + return; + } + } else { + // Handle large requests + if (!current_base || !pred_lat) { + local_irq_restore(flags); + return; + } + + bucket_index = lm_input_bucket_index(latency, pred_lat); + + if (bucket_index >= LM_LAT_BUCKET_COUNT) + bucket_index = LM_LAT_BUCKET_COUNT - 1; + + buckets->large_bucket[bucket_index].sum_of_weights += weight; + buckets->large_bucket[bucket_index].weighted_sum_latency += + latency * weight; + buckets->large_bucket[bucket_index].weighted_sum_block_size += + block_size * weight; + + local_irq_restore(flags); + } +} + +// Predict the latency for a given block size using the latency model +static u64 latency_model_predict(struct latency_model *model, u32 block_size) { + u64 result; + struct latency_model_params *params; + + rcu_read_lock(); + params = rcu_dereference(model->params); + + result = params->base; + if (block_size > LM_BLOCK_SIZE_THRESHOLD) + result += params->slope * + DIV_ROUND_UP_ULL(block_size - LM_BLOCK_SIZE_THRESHOLD, 1024); + + rcu_read_unlock(); + + return result; +} + +// Determine the type of operation based on request flags +static u8 adios_optype(struct request *rq) { + switch (rq->cmd_flags & REQ_OP_MASK) { + case REQ_OP_READ: + return ADIOS_READ; + case REQ_OP_WRITE: + return ADIOS_WRITE; + case REQ_OP_DISCARD: + return ADIOS_DISCARD; + default: + return ADIOS_OTHER; + } +} + +static inline u8 adios_optype_not_read(struct request *rq) { + return (rq->cmd_flags & REQ_OP_MASK) != REQ_OP_READ; +} + +// Helper function to retrieve adios_rq_data from a request +static inline struct adios_rq_data *get_rq_data(struct request *rq) { + return rq->elv.priv[0]; +} + +static inline +void set_adios_state(struct adios_data *ad, u32 shift, u32 idx, bool flag) { + if (flag) + atomic_or(1U << (idx + shift), &ad->state); + else + atomic_andnot(1U << (idx + shift), &ad->state); +} + +static inline u32 get_adios_state(struct adios_data *ad) +{ return atomic_read(&ad->state); } + +static inline u32 eval_this_adios_state(u32 state, u32 shift) +{ return (state >> shift) & 0x3; } + +static inline u32 eval_adios_state(struct adios_data *ad, u32 shift) +{ return eval_this_adios_state(get_adios_state(ad), shift); } + +// Add a request to the deadline-sorted red-black tree +static void add_to_dl_tree( + struct adios_data *ad, bool dl_idx, struct request *rq) { + struct rb_root_cached *root = &ad->dl_tree[dl_idx]; + struct rb_node **link = &(root->rb_root.rb_node), *parent = NULL; + bool leftmost = true; + struct adios_rq_data *rd = get_rq_data(rq); + struct dl_group *dlg; + u64 deadline; + bool was_empty = RB_EMPTY_ROOT(&root->rb_root); + + /* Tier-2: Synchronous Requests + * - Needs to be FIFO within a same optype + * - Relaxed order between different optypes + * - basically needs to be processed in early time */ + rd->deadline = rq->start_time_ns; + + /* Tier-3: Aynchronous Requests + * - Can be reordered and delayed freely */ + if (!(rq->cmd_flags & REQ_SYNC)) { + rd->deadline += ad->latency_target[adios_optype(rq)]; + if (!compliant(ad, ADIOS_CF_FIXORDER)) + rd->deadline += rd->pred_lat; + } + + // Now quantize 
the deadline (-> dlg->deadline == RB-Tree key) + deadline = rd->deadline & ~((1ULL << ADIOS_QUANTUM_SHIFT) - 1); + + while (*link) { + dlg = rb_entry(*link, struct dl_group, node); + s64 diff = deadline - dlg->deadline; + + parent = *link; + if (diff < 0) { + link = &((*link)->rb_left); + } else if (diff > 0) { + link = &((*link)->rb_right); + leftmost = false; + } else { // diff == 0 + goto found; + } + } + + dlg = rb_entry_safe(parent, struct dl_group, node); + if (!dlg || dlg->deadline != deadline) { + dlg = kmem_cache_zalloc(ad->dl_group_pool, GFP_ATOMIC); + if (!dlg) + return; + dlg->deadline = deadline; + INIT_LIST_HEAD(&dlg->rqs); + rb_link_node(&dlg->node, parent, link); + rb_insert_color_cached(&dlg->node, root, leftmost); + } +found: + list_add_tail(&rd->dl_node, &dlg->rqs); + rd->dl_group = &dlg->rqs; + + if (was_empty) + set_adios_state(ad, ADIOS_STATE_DL, dl_idx, true); +} + +// Remove a request from the deadline-sorted red-black tree +static void del_from_dl_tree( + struct adios_data *ad, bool dl_idx, struct request *rq) { + struct rb_root_cached *root = &ad->dl_tree[dl_idx]; + struct adios_rq_data *rd = get_rq_data(rq); + struct dl_group *dlg = container_of(rd->dl_group, struct dl_group, rqs); + + list_del_init(&rd->dl_node); + if (list_empty(&dlg->rqs)) { + rb_erase_cached(&dlg->node, root); + kmem_cache_free(ad->dl_group_pool, dlg); + } + rd->dl_group = NULL; + + if (RB_EMPTY_ROOT(&ad->dl_tree[dl_idx].rb_root)) + set_adios_state(ad, ADIOS_STATE_DL, dl_idx, false); +} + +// Remove a request from the scheduler +static void remove_request(struct adios_data *ad, struct request *rq) { + bool dl_idx = adios_optype_not_read(rq); + struct request_queue *q = rq->q; + struct adios_rq_data *rd = get_rq_data(rq); + + list_del_init(&rq->queuelist); + + // We might not be on the rbtree, if we are doing an insert merge + if (rd->dl_group) + del_from_dl_tree(ad, dl_idx, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; +} + +// Convert a queue depth to the corresponding word depth for shallow allocation +static int to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) { + struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; + const unsigned int nrr = hctx->queue->nr_requests; + + return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; +} + +// We limit the depth of request allocation for asynchronous and write requests +static void adios_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { + struct adios_data *ad = data->q->elevator->elevator_data; + + // Do not throttle synchronous reads + if (op_is_sync(opf) && !op_is_write(opf)) + return; + + data->shallow_depth = to_word_depth(data->hctx, ad->async_depth); +} + +// The number of requests in the queue was notified from the block layer +static void adios_depth_updated(struct request_queue *q) { + struct adios_data *ad = q->elevator->elevator_data; + + ad->async_depth = q->nr_requests; + blk_mq_set_min_shallow_depth(q, 1); +} + +// Handle request merging after a merge operation +static void adios_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) { + bool dl_idx = adios_optype_not_read(req); + struct adios_data *ad = q->elevator->elevator_data; + + // Reposition request in the deadline-sorted tree + del_from_dl_tree(ad, dl_idx, req); + add_to_dl_tree(ad, dl_idx, req); +} + +// Handle merging of requests after one has been merged into another +static void adios_merged_requests(struct request_queue *q, struct request *req, + struct request *next) { + struct 
adios_data *ad = q->elevator->elevator_data; + + lockdep_assert_held(&ad->lock); + + // kill knowledge of next, this one is a goner + remove_request(ad, next); +} + +// Try to merge a bio into an existing rq before associating it with an rq +static bool adios_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) { + unsigned long flags; + struct adios_data *ad = q->elevator->elevator_data; + struct request *free = NULL; + bool ret; + + if (eval_adios_state(ad, ADIOS_STATE_BP)) + return false; + + if (!spin_trylock_irqsave(&ad->lock, flags)) + return false; + + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irqrestore(&ad->lock, flags); + + if (free) + blk_mq_free_request(free); + + return ret; +} + +static bool merge_or_insert_to_dl_tree(struct adios_data *ad, + struct request *rq, struct request_queue *q, struct list_head *free) { + if (blk_mq_sched_try_insert_merge(q, rq, free)) + return true; + + bool dl_idx = adios_optype_not_read(rq); + add_to_dl_tree(ad, dl_idx, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + + return false; +} + +static void insert_to_prio_queue(struct adios_data *ad, + struct request *rq, bool pq_idx) { + struct adios_rq_data *rd = get_rq_data(rq); + + /* We're sure that rd->managed == true */ + union adios_in_flight_rqs ifr = { + .count = 1, + .total_pred_lat = rd->pred_lat, + }; + atomic64_add(ifr.scalar, &ad->in_flight_rqs.atomic); + + scoped_guard(spinlock_irqsave, &ad->pq_lock) { + bool was_empty = list_empty(&ad->prio_queue[pq_idx]); + list_add_tail(&rq->queuelist, &ad->prio_queue[pq_idx]); + if (was_empty) + set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, true); + } +} + +// Insert a request into the scheduler (after Read & Write models stabilized) +static void insert_request_post_stability(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_insert_t insert_flags, struct list_head *free) { + struct request_queue *q = hctx->queue; + struct adios_data *ad = q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + u8 optype = adios_optype(rq); + bool rq_is_flush; + + rd->managed = true; + rd->block_size = blk_rq_bytes(rq); + rd->pred_lat = + latency_model_predict(&ad->latency_model[optype], rd->block_size); + + /* Tier-0: BLK_MQ_INSERT_AT_HEAD Requests */ + if (insert_flags & BLK_MQ_INSERT_AT_HEAD) { + insert_to_prio_queue(ad, rq, 0); + return; + } + + /* + * Strict Barrier Handling for REQ_OP_FLUSH: + * If a flush request arrives, or if the scheduler is already in a + * barrier-pending state, all subsequent requests are diverted to a + * separate barrier_queue. This ensures that no new requests are processed + * until all work preceding the barrier is complete. 
+ */ + rq_is_flush = rq->cmd_flags & REQ_OP_FLUSH; + if (eval_adios_state(ad, ADIOS_STATE_BP) || rq_is_flush) { + scoped_guard(spinlock_irqsave, &ad->barrier_lock) { + if (rq_is_flush) + set_adios_state(ad, ADIOS_STATE_BP, 0, true); + list_add_tail(&rq->queuelist, &ad->barrier_queue); + } + return; + } + + if (merge_or_insert_to_dl_tree(ad, rq, q, free)) + return; +} + +// Insert a request into the scheduler (before Read & Write models stabilizes) +static void insert_request_pre_stability(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_insert_t insert_flags, struct list_head *free) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + u8 optype = adios_optype(rq); + u8 pq_idx = !(insert_flags & BLK_MQ_INSERT_AT_HEAD); + bool models_stable = false; + + rd->managed = true; + rd->block_size = blk_rq_bytes(rq); + rd->pred_lat = + latency_model_predict(&ad->latency_model[optype], rd->block_size); + + insert_to_prio_queue(ad, rq, pq_idx); + + rcu_read_lock(); + if (rcu_dereference(ad->latency_model[ADIOS_READ].params)->base > 0 && + rcu_dereference(ad->latency_model[ADIOS_WRITE].params)->base > 0) + models_stable = true; + rcu_read_unlock(); + + if (models_stable) + ad->insert_request_fn = insert_request_post_stability; +} + +// Insert multiple requests into the scheduler +static void adios_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, + blk_insert_t insert_flags) { + struct request_queue *q = hctx->queue; + struct adios_data *ad = q->elevator->elevator_data; + struct request *rq; + bool stop = false; + LIST_HEAD(free); + + do { + scoped_guard(spinlock_irqsave, &ad->lock) + for (int i = 0; i < ADIOS_MAX_INSERTS_PER_LOCK; i++) { + if (list_empty(list)) { + stop = true; + break; + } + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + ad->insert_request_fn(hctx, rq, insert_flags, &free); + }} while (!stop); + + blk_mq_free_requests(&free); +} + +// Prepare a request before it is inserted into the scheduler +static void adios_prepare_request(struct request *rq) { + struct adios_data *ad = rq->q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + + rq->elv.priv[0] = NULL; + + /* Allocate adios_rq_data from the memory pool */ + rd = kmem_cache_zalloc(ad->rq_data_pool, GFP_ATOMIC); + if (WARN(!rd, "adios_prepare_request: " + "Failed to allocate memory from rq_data_pool. rd is NULL\n")) + return; + + rd->rq = rq; + rq->elv.priv[0] = rd; +} + +static struct adios_rq_data *get_dl_first_rd(struct adios_data *ad, bool idx) { + struct rb_root_cached *root = &ad->dl_tree[idx]; + struct rb_node *first = rb_first_cached(root); + struct dl_group *dl_group = rb_entry(first, struct dl_group, node); + + return list_first_entry(&dl_group->rqs, struct adios_rq_data, dl_node); +} + +// Comparison function for sorting requests by block address +static int cmp_rq_pos(void *priv, + const struct list_head *a, const struct list_head *b) { + struct request *rq_a = list_entry(a, struct request, queuelist); + struct request *rq_b = list_entry(b, struct request, queuelist); + u64 pos_a = blk_rq_pos(rq_a); + u64 pos_b = blk_rq_pos(rq_b); + + return (int)(pos_a > pos_b) - (int)(pos_a < pos_b); +} + +#ifndef list_last_entry_or_null +#define list_last_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? 
list_last_entry(ptr, type, member) : NULL) +#endif + +// Update the elevator direction +static void update_elv_direction(struct adios_data *ad) { + if (!ad->is_rotational) + return; + + bool page = ad->bq_page; + struct list_head *q = &ad->batch_queue[page][1]; + if (ad->bq_batch_order[page] < ADIOS_BO_ELEVATOR || list_empty(q)) { + ad->elv_direction = 0; + return; + } + + // Get first and last request positions in the queue + struct request *rq_a = list_first_entry(q, struct request, queuelist); + struct request *rq_b = list_last_entry (q, struct request, queuelist); + u64 pos_a = blk_rq_pos(rq_a); + u64 pos_b = blk_rq_pos(rq_b); + u64 avg_rq_pos = (pos_a + pos_b) >> 1; + + ad->elv_direction = !!(ad->head_pos > avg_rq_pos); +} + +// Fill the batch queues with requests from the deadline-sorted red-black tree +static bool fill_batch_queues(struct adios_data *ad, u64 tpl) { + struct adios_rq_data *rd; + struct request *rq; + struct list_head *dest_q; + u8 dest_idx; + u64 added_lat = 0; + u32 optype_count[ADIOS_OPTYPES] = {0}; + u32 count = 0; + u8 optype; + bool page = !ad->bq_page, dl_idx, bias_idx, update_bias; + u32 dl_queued; + u8 bq_batch_order; + bool stop = false; + + // Reset batch queue counts for the back page + memset(&ad->batch_count[page], 0, sizeof(ad->batch_count[page])); + + ad->bq_batch_order[page] = + bq_batch_order = ad->batch_order; + + do { + scoped_guard(spinlock_irqsave, &ad->lock) + for (int i = 0; i < ADIOS_MAX_DELETES_PER_LOCK; i++) { + bool has_base = false; + + dl_queued = eval_adios_state(ad, ADIOS_STATE_DL); + // Check if there are any requests queued in the deadline tree + if (!dl_queued) { + stop = true; + break; + } + + // Reads if both queues have requests, otherwise pick the non-empty. + dl_idx = dl_queued >> 1; + + // Get the first request from the deadline-sorted tree + rd = get_dl_first_rd(ad, dl_idx); + + bias_idx = ad->dl_bias < 0; + // If read and write requests are queued, choose one based on bias + if (dl_queued == 0x3) { + struct adios_rq_data *trd[2] = {get_dl_first_rd(ad, 0), rd}; + rd = trd[bias_idx]; + + update_bias = (trd[bias_idx]->deadline > trd[!bias_idx]->deadline); + } else + update_bias = (bias_idx == dl_idx); + + rq = rd->rq; + optype = adios_optype(rq); + + rcu_read_lock(); + has_base = + !!rcu_dereference(ad->latency_model[optype].params)->base; + rcu_read_unlock(); + + // Check batch size and total predicted latency + if (count && (!has_base || + ad->batch_count[page][optype] >= ad->batch_limit[optype] || + (tpl + added_lat + rd->pred_lat) > ad->global_latency_window)) { + stop = true; + break; + } + + if (update_bias) { + s64 sign = ((s64)bias_idx << 1) - 1; + if (unlikely(!rd->pred_lat)) + ad->dl_bias = sign; + else + // Adjust the bias based on the predicted latency + ad->dl_bias += sign * (s64)((rd->pred_lat * + adios_prio_to_wmult[ad->dl_prio[bias_idx] + 20]) >> 10); + } + + remove_request(ad, rq); + + // Add request to the corresponding batch queue + dest_idx = (bq_batch_order == ADIOS_BO_OPTYPE || optype == ADIOS_OTHER)? 
+ optype : !!(rd->deadline != rq->start_time_ns); + dest_q = &ad->batch_queue[page][dest_idx]; + list_add_tail(&rq->queuelist, dest_q); + ad->bq_state[page] |= 1U << dest_idx; + ad->batch_count[page][optype]++; + optype_count[optype]++; + added_lat += rd->pred_lat; + count++; + }} while (!stop); + + if (bq_batch_order == ADIOS_BO_ELEVATOR && ad->batch_count[page][1] > 1) + list_sort(NULL, &ad->batch_queue[page][1], cmp_rq_pos); + + if (count) { + /* We're sure that every request's rd->managed == true */ + union adios_in_flight_rqs ifr = { + .count = count, + .total_pred_lat = added_lat, + }; + atomic64_add(ifr.scalar, &ad->in_flight_rqs.atomic); + + set_adios_state(ad, ADIOS_STATE_BQ, page, true); + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) + if (ad->batch_actual_max_size[optype] < optype_count[optype]) + ad->batch_actual_max_size[optype] = optype_count[optype]; + if (ad->batch_actual_max_total < count) + ad->batch_actual_max_total = count; + } + return count; +} + +// Flip to the next batch queue page +static void flip_bq_page(struct adios_data *ad) { + ad->bq_page = !ad->bq_page; + update_elv_direction(ad); +} + +// Pop a request from the specified index (optype or elevator tier) +static inline struct request *pop_bq_request( + struct adios_data *ad, u8 idx, bool direction) { + bool page = ad->bq_page; + struct list_head *q = &ad->batch_queue[page][idx]; + struct request *rq = direction ? + list_last_entry_or_null (q, struct request, queuelist): + list_first_entry_or_null(q, struct request, queuelist); + if (rq) { + list_del_init(&rq->queuelist); + if (list_empty(q)) + ad->bq_state[page] &= ~(1U << idx); + } + return rq; +} + +static struct request *pop_next_bq_request_optype(struct adios_data *ad) { + u32 bq_state = ad->bq_state[ad->bq_page]; + if (!bq_state) return NULL; + + struct request *rq; + u32 bq_idx = __builtin_ctz(bq_state); + + // Dispatch based on optype (FIFO within each) or single-queue elevator + rq = pop_bq_request(ad, bq_idx, false); + return rq; +} + +static struct request *pop_next_bq_request_elevator(struct adios_data *ad) { + u32 bq_state = ad->bq_state[ad->bq_page]; + if (!bq_state) return NULL; + + struct request *rq; + u32 bq_idx = __builtin_ctz(bq_state); + bool direction = (bq_idx == 1) & ad->elv_direction; + + // Tier-2 (sync) is always high priority + // Tier-3 (async) uses the pre-calculated elevator direction + rq = pop_bq_request(ad, bq_idx, direction); + + /* If batch queue for the sync requests just became empty */ + if (bq_idx == 0 && rq && !(bq_state & 0x1)) + update_elv_direction(ad); + + return rq; +} + +// Returns the state of the batch queue page +static inline bool bq_page_has_rq(u32 bq_state, bool page) { + return bq_state & (1U << page); +} + +// Dispatch a request from the batch queues +static struct request *dispatch_from_bq(struct adios_data *ad) { + struct request *rq; + + guard(spinlock_irqsave)(&ad->bq_lock); + + u32 state = get_adios_state(ad); + u32 bq_state = eval_this_adios_state(state, ADIOS_STATE_BQ); + u32 bq_curr_page_has_rq = bq_page_has_rq(bq_state, ad->bq_page); + union adios_in_flight_rqs ifr; + ifr.scalar = atomic64_read(&ad->in_flight_rqs.atomic); + u64 tpl = ifr.total_pred_lat; + + // Refill the batch queues if the back page is empty, dl_tree has work, and + // current page is empty or the total ongoing latency is below the threshold + if (!bq_page_has_rq(bq_state, !ad->bq_page) && + (!bq_curr_page_has_rq || (!tpl || tpl < div_u64( + ad->global_latency_window * ad->bq_refill_below_ratio, 100))) && + 
eval_this_adios_state(state, ADIOS_STATE_DL)) + fill_batch_queues(ad, tpl); + + // If current batch queue page is empty, and the other page has work, flip + if (!bq_curr_page_has_rq && + bq_page_has_rq(eval_adios_state(ad, ADIOS_STATE_BQ), !ad->bq_page)) + flip_bq_page(ad); + + // Use the per-page state to decide the dispatch logic, ensuring correctness + rq = (ad->bq_batch_order[ad->bq_page] == ADIOS_BO_ELEVATOR) ? + pop_next_bq_request_elevator(ad): + pop_next_bq_request_optype(ad); + + if (rq) { + bool page = ad->bq_page; + bool is_empty = !ad->bq_state[page]; + if (is_empty) + set_adios_state(ad, ADIOS_STATE_BQ, page, false); + return rq; + } + + return NULL; +} + +// Dispatch a request from the priority queue +static struct request *dispatch_from_pq(struct adios_data *ad) { + struct request *rq = NULL; + + guard(spinlock_irqsave)(&ad->pq_lock); + u32 pq_state = eval_adios_state(ad, ADIOS_STATE_PQ); + u8 pq_idx = pq_state >> 1; + struct list_head *q = &ad->prio_queue[pq_idx]; + + if (unlikely(list_empty(q))) return NULL; + + rq = list_first_entry(q, struct request, queuelist); + list_del_init(&rq->queuelist); + if (list_empty(q)) { + set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, false); + update_elv_direction(ad); + } + return rq; +} + +static bool release_barrier_requests(struct adios_data *ad) { + u32 moved_count = 0; + LIST_HEAD(local_list); + + scoped_guard(spinlock_irqsave, &ad->barrier_lock) { + if (!list_empty(&ad->barrier_queue)) { + struct request *trq, *next; + bool first_barrier_moved = false; + + list_for_each_entry_safe(trq, next, &ad->barrier_queue, queuelist) { + if (!first_barrier_moved) { + list_del_init(&trq->queuelist); + insert_to_prio_queue(ad, trq, 1); + moved_count++; + first_barrier_moved = true; + continue; + } + + if (trq->cmd_flags & REQ_OP_FLUSH) + break; + + list_move_tail(&trq->queuelist, &local_list); + moved_count++; + } + + if (list_empty(&ad->barrier_queue)) + set_adios_state(ad, ADIOS_STATE_BP, 0, false); + } + } + + if (!moved_count) + return false; + + if (!list_empty(&local_list)) { + struct request *trq, *next; + LIST_HEAD(free_list); + + /* ad->lock is already held */ + list_for_each_entry_safe(trq, next, &local_list, queuelist) { + list_del_init(&trq->queuelist); + if (merge_or_insert_to_dl_tree(ad, trq, ad->queue, &free_list)) + continue; + } + + if (!list_empty(&free_list)) + blk_mq_free_requests(&free_list); + } + + return true; +} + +// Dispatch a request to the hardware queue +static struct request *adios_dispatch_request(struct blk_mq_hw_ctx *hctx) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + struct request *rq; + +retry: + rq = dispatch_from_pq(ad); + if (rq) + goto found; + + rq = dispatch_from_bq(ad); + if (rq) + goto found; + + /* + * If all active queues are empty, check if we need to process a barrier. + * This is the trigger to release requests that were held in barrier_queue + * due to a REQ_OP_FLUSH barrier. 
+ */ + if (eval_adios_state(ad, ADIOS_STATE_BP)) { + union adios_in_flight_rqs ifr; + ifr.scalar = atomic64_read(&ad->in_flight_rqs.atomic); + if (!ifr.count) { + bool barrier_released = false; + scoped_guard(spinlock_irqsave, &ad->lock) + barrier_released = release_barrier_requests(ad); + if (barrier_released) + goto retry; + } + } + + return NULL; +found: + if (ad->is_rotational) + ad->head_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + rq->rq_flags |= RQF_STARTED; + return rq; +} + +// Timer callback function to periodically update latency models +static void update_timer_callback(struct timer_list *t) { + struct adios_data *ad = timer_container_of(ad, t, update_timer); + + for (u8 optype = 0; optype < ADIOS_OPTYPES; optype++) + latency_model_update(ad, &ad->latency_model[optype]); +} + +// Handle the completion of a request +static void adios_completed_request(struct request *rq, u64 now) { + struct adios_data *ad = rq->q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + union adios_in_flight_rqs ifr = { .scalar = 0 }; + + if (rd->managed) { + union adios_in_flight_rqs ifr_to_sub = { + .count = 1, + .total_pred_lat = rd->pred_lat, + }; + ifr.scalar = atomic64_sub_return( + ifr_to_sub.scalar, &ad->in_flight_rqs.atomic); + } + u8 optype = adios_optype(rq); + + if (optype == ADIOS_OTHER) { + // Non-positional commands make the head position unpredictable. + // Invalidate our knowledge of the last completed position. + if (ad->is_rotational) + ad->last_completed_pos = 0; + return; + } + + u64 lct = ad->last_completed_time ?: rq->io_start_time_ns; + ad->last_completed_time = (ifr.count) ? now : 0; + + if (!rq->io_start_time_ns || !rd->block_size || unlikely(now < lct)) + return; + + u64 latency = now - lct; + if (latency > ad->lat_model_latency_limit) + return; + + u32 weight = 1; + if (ad->is_rotational) { + sector_t current_pos = blk_rq_pos(rq); + // Only calculate seek distance if we have a valid last position. + if (ad->last_completed_pos > 0) { + u64 seek_distance = abs( + (s64)current_pos - (s64)ad->last_completed_pos); + weight = 65 - __builtin_clzll(seek_distance); + } + // Update (or re-synchronize) our knowledge of the head position. 
+ ad->last_completed_pos = current_pos + blk_rq_sectors(rq); + } + + latency_model_input(ad, &ad->latency_model[optype], + rd->block_size, latency, rd->pred_lat, weight); + timer_reduce(&ad->update_timer, jiffies + msecs_to_jiffies(100)); +} + +// Clean up after a request is finished +static void adios_finish_request(struct request *rq) { + struct adios_data *ad = rq->q->elevator->elevator_data; + + if (rq->elv.priv[0]) { + // Free adios_rq_data back to the memory pool + kmem_cache_free(ad->rq_data_pool, get_rq_data(rq)); + rq->elv.priv[0] = NULL; + } +} + +// Check if there are any requests available for dispatch +static bool adios_has_work(struct blk_mq_hw_ctx *hctx) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + + return atomic_read(&ad->state) != 0; +} + +// Initialize the scheduler-specific data when initializing the request queue +static int adios_init_sched(struct request_queue *q, struct elevator_queue *eq) { + struct adios_data *ad; + int ret = -ENOMEM; + u8 optype = 0; + + ad = kzalloc_node(sizeof(*ad), GFP_KERNEL, q->node); + if (!ad) { + pr_err("adios: Failed to create adios_data\n"); + goto put_eq; + } + + eq->elevator_data = ad; + + // Create a memory pool for adios_rq_data + ad->rq_data_pool = kmem_cache_create("rq_data_pool", + sizeof(struct adios_rq_data), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ad->rq_data_pool) { + pr_err("adios: Failed to create rq_data_pool\n"); + goto free_ad; + } + + /* Create a memory pool for dl_group */ + ad->dl_group_pool = kmem_cache_create("dl_group_pool", + sizeof(struct dl_group), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ad->dl_group_pool) { + pr_err("adios: Failed to create dl_group_pool\n"); + goto destroy_rq_data_pool; + } + + for (int i = 0; i < ADIOS_PQ_LEVELS; i++) + INIT_LIST_HEAD(&ad->prio_queue[i]); + + for (u8 i = 0; i < ADIOS_DL_TYPES; i++) { + ad->dl_tree[i] = RB_ROOT_CACHED; + ad->dl_prio[i] = default_dl_prio[i]; + } + ad->dl_bias = 0; + + for (u8 page = 0; page < ADIOS_BQ_PAGES; page++) + for (optype = 0; optype < ADIOS_OPTYPES; optype++) + INIT_LIST_HEAD(&ad->batch_queue[page][optype]); + + ad->aggr_buckets = kzalloc(sizeof(*ad->aggr_buckets), GFP_KERNEL); + if (!ad->aggr_buckets) { + pr_err("adios: Failed to allocate aggregation buckets\n"); + goto destroy_dl_group_pool; + } + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) { + struct latency_model *model = &ad->latency_model[optype]; + struct latency_model_params *params; + + spin_lock_init(&model->update_lock); + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (!params) { + pr_err("adios: Failed to allocate latency_model_params\n"); + goto free_buckets; + } + params->last_update_jiffies = jiffies; + RCU_INIT_POINTER(model->params, params); + + model->pcpu_buckets = alloc_percpu(struct lm_buckets); + if (!model->pcpu_buckets) { + pr_err("adios: Failed to allocate per-CPU buckets\n"); + kfree(params); + goto free_buckets; + } + + model->lm_shrink_at_kreqs = default_lm_shrink_at_kreqs; + model->lm_shrink_at_gbytes = default_lm_shrink_at_gbytes; + model->lm_shrink_resist = default_lm_shrink_resist; + } + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) { + ad->latency_target[optype] = default_latency_target[optype]; + ad->batch_limit[optype] = default_batch_limit[optype]; + } + + eq->elevator_data = ad; + + ad->is_rotational = !!(q->limits.features & BLK_FEAT_ROTATIONAL); + ad->global_latency_window = (ad->is_rotational)? 
+ default_global_latency_window_rotational: + default_global_latency_window; + ad->bq_refill_below_ratio = default_bq_refill_below_ratio; + ad->lat_model_latency_limit = default_lat_model_latency_limit; + ad->batch_order = default_batch_order; + ad->compliance_flags = default_compliance_flags; + + ad->insert_request_fn = insert_request_pre_stability; + + atomic_set(&ad->state, 0); + + spin_lock_init(&ad->lock); + spin_lock_init(&ad->pq_lock); + spin_lock_init(&ad->bq_lock); + spin_lock_init(&ad->barrier_lock); + INIT_LIST_HEAD(&ad->barrier_queue); + + timer_setup(&ad->update_timer, update_timer_callback, 0); + + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + + ad->queue = q; + blk_stat_enable_accounting(q); + + q->elevator = eq; + adios_depth_updated(q); + return 0; + +free_buckets: + pr_err("adios: Failed to allocate per-cpu buckets\n"); + while (optype-- > 0) { + struct latency_model *prev_model = &ad->latency_model[optype]; + kfree(rcu_access_pointer(prev_model->params)); + free_percpu(prev_model->pcpu_buckets); + } + kfree(ad->aggr_buckets); +destroy_dl_group_pool: + kmem_cache_destroy(ad->dl_group_pool); +destroy_rq_data_pool: + kmem_cache_destroy(ad->rq_data_pool); +free_ad: + kfree(ad); +put_eq: + kobject_put(&eq->kobj); + return ret; +} + +// Clean up and free resources when exiting the scheduler +static void adios_exit_sched(struct elevator_queue *e) { + struct adios_data *ad = e->elevator_data; + + timer_shutdown_sync(&ad->update_timer); + + WARN_ON_ONCE(!list_empty(&ad->barrier_queue)); + for (int i = 0; i < 2; i++) + WARN_ON_ONCE(!list_empty(&ad->prio_queue[i])); + + for (u8 i = 0; i < ADIOS_OPTYPES; i++) { + struct latency_model *model = &ad->latency_model[i]; + struct latency_model_params *params = rcu_access_pointer(model->params); + + RCU_INIT_POINTER(model->params, NULL); + kfree_rcu(params, rcu); + + free_percpu(model->pcpu_buckets); + } + + synchronize_rcu(); + + kfree(ad->aggr_buckets); + + if (ad->rq_data_pool) + kmem_cache_destroy(ad->rq_data_pool); + + if (ad->dl_group_pool) + kmem_cache_destroy(ad->dl_group_pool); + + blk_stat_disable_accounting(ad->queue); + + kfree(ad); +} + +static void sideload_latency_model( + struct latency_model *model, u64 base, u64 slope) { + struct latency_model_params *old_params, *new_params; + unsigned long flags; + + new_params = kzalloc(sizeof(*new_params), GFP_KERNEL); + if (!new_params) + return; + + spin_lock_irqsave(&model->update_lock, flags); + + old_params = rcu_dereference_protected(model->params, + lockdep_is_held(&model->update_lock)); + + new_params->last_update_jiffies = jiffies; + + // Initialize base and its statistics as a single sample. + new_params->base = base; + new_params->small_sum_delay = base; + new_params->small_count = 1; + + // Initialize slope and its statistics as a single sample. 
+        new_params->slope = slope;
+        new_params->large_sum_delay = slope;
+        new_params->large_sum_bsize = 1024; /* Corresponds to 1 KiB */
+
+        lm_reset_pcpu_buckets(model);
+
+        rcu_assign_pointer(model->params, new_params);
+        spin_unlock_irqrestore(&model->update_lock, flags);
+
+        kfree_rcu(old_params, rcu);
+}
+
+// Define sysfs attributes for operation types
+#define SYSFS_OPTYPE_DECL(name, optype) \
+static ssize_t adios_lat_model_##name##_show( \
+                struct elevator_queue *e, char *page) { \
+        struct adios_data *ad = e->elevator_data; \
+        struct latency_model *model = &ad->latency_model[optype]; \
+        struct latency_model_params *params; \
+        ssize_t len = 0; \
+        u64 base, slope; \
+        rcu_read_lock(); \
+        params = rcu_dereference(model->params); \
+        base = params->base; \
+        slope = params->slope; \
+        rcu_read_unlock(); \
+        len += sprintf(page, "base : %llu ns\n", base); \
+        len += sprintf(page + len, "slope: %llu ns/KiB\n", slope); \
+        return len; \
+} \
+static ssize_t adios_lat_model_##name##_store( \
+                struct elevator_queue *e, const char *page, size_t count) { \
+        struct adios_data *ad = e->elevator_data; \
+        struct latency_model *model = &ad->latency_model[optype]; \
+        u64 base, slope; \
+        int ret; \
+        ret = sscanf(page, "%llu %llu", &base, &slope); \
+        if (ret != 2) \
+                return -EINVAL; \
+        sideload_latency_model(model, base, slope); \
+        reset_buckets(ad->aggr_buckets); \
+        return count; \
+} \
+static ssize_t adios_lat_target_##name##_show( \
+                struct elevator_queue *e, char *page) { \
+        struct adios_data *ad = e->elevator_data; \
+        return sprintf(page, "%llu\n", ad->latency_target[optype]); \
+} \
+static ssize_t adios_lat_target_##name##_store( \
+                struct elevator_queue *e, const char *page, size_t count) { \
+        struct adios_data *ad = e->elevator_data; \
+        unsigned long nsec; \
+        int ret; \
+        ret = kstrtoul(page, 10, &nsec); \
+        if (ret) \
+                return ret; \
+        sideload_latency_model(&ad->latency_model[optype], 0, 0); \
+        ad->latency_target[optype] = nsec; \
+        return count; \
+} \
+static ssize_t adios_batch_limit_##name##_show( \
+                struct elevator_queue *e, char *page) { \
+        struct adios_data *ad = e->elevator_data; \
+        return sprintf(page, "%u\n", ad->batch_limit[optype]); \
+} \
+static ssize_t adios_batch_limit_##name##_store( \
+                struct elevator_queue *e, const char *page, size_t count) { \
+        unsigned long max_batch; \
+        int ret; \
+        ret = kstrtoul(page, 10, &max_batch); \
+        if (ret || max_batch == 0) \
+                return -EINVAL; \
+        struct adios_data *ad = e->elevator_data; \
+        ad->batch_limit[optype] = max_batch; \
+        return count; \
+}
+
+SYSFS_OPTYPE_DECL(read, ADIOS_READ);
+SYSFS_OPTYPE_DECL(write, ADIOS_WRITE);
+SYSFS_OPTYPE_DECL(discard, ADIOS_DISCARD);
+
+// Show the maximum batch size actually achieved for each operation type
+static ssize_t adios_batch_actual_max_show(
+                struct elevator_queue *e, char *page) {
+        struct adios_data *ad = e->elevator_data;
+        u32 total_count, read_count, write_count, discard_count;
+
+        total_count = ad->batch_actual_max_total;
+        read_count = ad->batch_actual_max_size[ADIOS_READ];
+        write_count = ad->batch_actual_max_size[ADIOS_WRITE];
+        discard_count = ad->batch_actual_max_size[ADIOS_DISCARD];
+
+        return sprintf(page,
+                "Total : %u\nDiscard: %u\nRead : %u\nWrite : %u\n",
+                total_count, discard_count, read_count, write_count);
+}
+
+#define SYSFS_ULL_DECL(field, min_val, max_val) \
+static ssize_t adios_##field##_show( \
+                struct elevator_queue *e, char *page) { \
+        struct adios_data *ad = e->elevator_data; \
+        return sprintf(page, "%llu\n", ad->field); \
+} \
+static ssize_t adios_##field##_store( \
+                struct elevator_queue *e, const char *page, size_t count) { \
+        struct adios_data *ad = e->elevator_data; \
+        unsigned long val; \
+        int ret; \
+        ret = kstrtoul(page, 10, &val); \
+        if (ret || val < (min_val) || val > (max_val)) \
+                return -EINVAL; \
+        ad->field = val; \
+        return count; \
+}
+
+SYSFS_ULL_DECL(global_latency_window, 0, ULLONG_MAX)
+SYSFS_ULL_DECL(compliance_flags, 0, ULLONG_MAX)
+
+#define SYSFS_INT_DECL(field, min_val, max_val) \
+static ssize_t adios_##field##_show( \
+                struct elevator_queue *e, char *page) { \
+        struct adios_data *ad = e->elevator_data; \
+        return sprintf(page, "%d\n", ad->field); \
+} \
+static ssize_t adios_##field##_store( \
+                struct elevator_queue *e, const char *page, size_t count) { \
+        struct adios_data *ad = e->elevator_data; \
+        int val; \
+        int ret; \
+        ret = kstrtoint(page, 10, &val); \
+        if (ret || val < (min_val) || val > (max_val)) \
+                return -EINVAL; \
+        ad->field = val; \
+        return count; \
+}
+
+SYSFS_INT_DECL(bq_refill_below_ratio, 0, 100)
+SYSFS_INT_DECL(lat_model_latency_limit, 0, 2*NSEC_PER_SEC)
+SYSFS_INT_DECL(batch_order, ADIOS_BO_OPTYPE, !!ad->is_rotational)
+
+// Show the read priority
+static ssize_t adios_read_priority_show(
+                struct elevator_queue *e, char *page) {
+        struct adios_data *ad = e->elevator_data;
+        return sprintf(page, "%d\n", ad->dl_prio[0]);
+}
+
+// Set the read priority
+static ssize_t adios_read_priority_store(
+                struct elevator_queue *e, const char *page, size_t count) {
+        struct adios_data *ad = e->elevator_data;
+        int prio;
+        int ret;
+
+        ret = kstrtoint(page, 10, &prio);
+        if (ret || prio < -20 || prio > 19)
+                return -EINVAL;
+
+        guard(spinlock_irqsave)(&ad->lock);
+        ad->dl_prio[0] = prio;
+        ad->dl_bias = 0;
+
+        return count;
+}
+
+// Reset batch queue statistics
+static ssize_t adios_reset_bq_stats_store(
+                struct elevator_queue *e, const char *page, size_t count) {
+        struct adios_data *ad = e->elevator_data;
+        unsigned long val;
+        int ret;
+
+        ret = kstrtoul(page, 10, &val);
+        if (ret || val != 1)
+                return -EINVAL;
+
+        for (u8 i = 0; i < ADIOS_OPTYPES; i++)
+                ad->batch_actual_max_size[i] = 0;
+
+        ad->batch_actual_max_total = 0;
+
+        return count;
+}
+
+// Reset the latency model parameters or load them from user input
+static ssize_t adios_reset_lat_model_store(
+                struct elevator_queue *e, const char *page, size_t count)
+{
+        struct adios_data *ad = e->elevator_data;
+        struct latency_model *model;
+        int ret;
+
+        /*
+         * Differentiate between two modes based on input format:
+         * 1. "1": Fully reset the model (backward compatibility).
+         * 2. "R_base R_slope W_base W_slope D_base D_slope": Load values.
+         */
+        if (!strchr(page, ' ')) {
+                // Mode 1: Full reset.
+                unsigned long val;
+
+                ret = kstrtoul(page, 10, &val);
+                if (ret || val != 1)
+                        return -EINVAL;
+
+                for (u8 i = 0; i < ADIOS_OPTYPES; i++) {
+                        model = &ad->latency_model[i];
+                        sideload_latency_model(model, 0, 0);
+                }
+        } else {
+                // Mode 2: Load initial values for all latency models.
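+                // Example (illustrative values, not defaults): writing
+                //   "120000 1500 180000 2000 90000 800"
+                // loads base/slope pairs (ns and ns/KiB, per the lat_model
+                // sysfs output) for read, write and discard, in that order.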
+                u64 params[3][2]; /* 0:base, 1:slope for R, W, D */
+
+                ret = sscanf(page, "%llu %llu %llu %llu %llu %llu",
+                        &params[ADIOS_READ   ][0], &params[ADIOS_READ   ][1],
+                        &params[ADIOS_WRITE  ][0], &params[ADIOS_WRITE  ][1],
+                        &params[ADIOS_DISCARD][0], &params[ADIOS_DISCARD][1]);
+
+                if (ret != 6)
+                        return -EINVAL;
+
+                for (u8 i = ADIOS_READ; i <= ADIOS_DISCARD; i++) {
+                        model = &ad->latency_model[i];
+                        sideload_latency_model(model, params[i][0], params[i][1]);
+                }
+        }
+        reset_buckets(ad->aggr_buckets);
+
+        return count;
+}
+
+// Show the ADIOS version
+static ssize_t adios_version_show(struct elevator_queue *e, char *page) {
+        return sprintf(page, "%s\n", ADIOS_VERSION);
+}
+
+// Define sysfs attributes for dynamic thresholds
+#define SHRINK_THRESHOLD_ATTR_RW(name, model_field, min_value, max_value) \
+static ssize_t adios_shrink_##name##_store( \
+                struct elevator_queue *e, const char *page, size_t count) { \
+        struct adios_data *ad = e->elevator_data; \
+        unsigned long val; \
+        int ret; \
+        ret = kstrtoul(page, 10, &val); \
+        if (ret || val < min_value || val > max_value) \
+                return -EINVAL; \
+        for (u8 i = 0; i < ADIOS_OPTYPES; i++) { \
+                struct latency_model *model = &ad->latency_model[i]; \
+                unsigned long flags; \
+                spin_lock_irqsave(&model->update_lock, flags); \
+                model->model_field = val; \
+                spin_unlock_irqrestore(&model->update_lock, flags); \
+        } \
+        return count; \
+} \
+static ssize_t adios_shrink_##name##_show( \
+                struct elevator_queue *e, char *page) { \
+        struct adios_data *ad = e->elevator_data; \
+        u32 val = 0; \
+        unsigned long flags; \
+        struct latency_model *model = &ad->latency_model[0]; \
+        spin_lock_irqsave(&model->update_lock, flags); \
+        val = model->model_field; \
+        spin_unlock_irqrestore(&model->update_lock, flags); \
+        return sprintf(page, "%u\n", val); \
+}
+
+SHRINK_THRESHOLD_ATTR_RW(at_kreqs, lm_shrink_at_kreqs, 1, 100000)
+SHRINK_THRESHOLD_ATTR_RW(at_gbytes, lm_shrink_at_gbytes, 1, 1000)
+SHRINK_THRESHOLD_ATTR_RW(resist, lm_shrink_resist, 1, 3)
+
+// Define sysfs attributes
+#define AD_ATTR(name, show_func, store_func) \
+        __ATTR(name, 0644, show_func, store_func)
+#define AD_ATTR_RW(name) \
+        __ATTR(name, 0644, adios_##name##_show, adios_##name##_store)
+#define AD_ATTR_RO(name) \
+        __ATTR(name, 0444, adios_##name##_show, NULL)
+#define AD_ATTR_WO(name) \
+        __ATTR(name, 0200, NULL, adios_##name##_store)
+
+// Define sysfs attributes for ADIOS scheduler
+static struct elv_fs_entry adios_sched_attrs[] = {
+        AD_ATTR_RO(batch_actual_max),
+        AD_ATTR_RW(bq_refill_below_ratio),
+        AD_ATTR_RW(global_latency_window),
+        AD_ATTR_RW(lat_model_latency_limit),
+        AD_ATTR_RW(batch_order),
+        AD_ATTR_RW(compliance_flags),
+
+        AD_ATTR_RW(batch_limit_read),
+        AD_ATTR_RW(batch_limit_write),
+        AD_ATTR_RW(batch_limit_discard),
+
+        AD_ATTR_RW(lat_model_read),
+        AD_ATTR_RW(lat_model_write),
+        AD_ATTR_RW(lat_model_discard),
+
+        AD_ATTR_RW(lat_target_read),
+        AD_ATTR_RW(lat_target_write),
+        AD_ATTR_RW(lat_target_discard),
+
+        AD_ATTR_RW(shrink_at_kreqs),
+        AD_ATTR_RW(shrink_at_gbytes),
+        AD_ATTR_RW(shrink_resist),
+
+        AD_ATTR_RW(read_priority),
+
+        AD_ATTR_WO(reset_bq_stats),
+        AD_ATTR_WO(reset_lat_model),
+        AD_ATTR(adios_version, adios_version_show, NULL),
+
+        __ATTR_NULL
+};
+
+// Define the ADIOS scheduler type
+static struct elevator_type mq_adios = {
+        .ops = {
+                .next_request = elv_rb_latter_request,
+                .former_request = elv_rb_former_request,
+                .limit_depth = adios_limit_depth,
+                .depth_updated = adios_depth_updated,
+                .request_merged = adios_request_merged,
+                .requests_merged = adios_merged_requests,
+                .bio_merge = adios_bio_merge,
+                .insert_requests = adios_insert_requests,
+                .prepare_request = adios_prepare_request,
+                .dispatch_request = adios_dispatch_request,
+                .completed_request = adios_completed_request,
+                .finish_request = adios_finish_request,
+                .has_work = adios_has_work,
+                .init_sched = adios_init_sched,
+                .exit_sched = adios_exit_sched,
+        },
+
+        .elevator_attrs = adios_sched_attrs,
+        .elevator_name = "adios",
+        .elevator_owner = THIS_MODULE,
+};
+MODULE_ALIAS("mq-adios-iosched");
+
+#define ADIOS_PROGNAME "Adaptive Deadline I/O Scheduler"
+#define ADIOS_AUTHOR "Masahito Suzuki"
+
+// Initialize the ADIOS scheduler module
+static int __init adios_init(void) {
+        printk(KERN_INFO "%s %s by %s\n",
+                ADIOS_PROGNAME, ADIOS_VERSION, ADIOS_AUTHOR);
+        return elv_register(&mq_adios);
+}
+
+// Exit the ADIOS scheduler module
+static void __exit adios_exit(void) {
+        elv_unregister(&mq_adios);
+}
+
+module_init(adios_init);
+module_exit(adios_exit);
+
+MODULE_AUTHOR(ADIOS_AUTHOR);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(ADIOS_PROGNAME);
\ No newline at end of file
diff --git a/block/elevator.c b/block/elevator.c
index f8b3fc16f304dd..49ede90c131d3b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -741,7 +741,9 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
 void elevator_set_default(struct request_queue *q)
 {
         struct elv_change_ctx ctx = {
-#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ)
+#ifdef CONFIG_MQ_IOSCHED_DEFAULT_ADIOS
+                .name = "adios",
+#elif defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ)
                 .name = "bfq",
 #else
                 .name = "mq-deadline",
@@ -762,12 +764,15 @@ void elevator_set_default(struct request_queue *q)
          * have multiple queues or mq-deadline is not available, default
          * to "none".
          */
-        if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags))
+#ifndef CONFIG_MQ_IOSCHED_DEFAULT_ADIOS
+        bool is_sq = q->nr_hw_queues == 1 || blk_mq_is_shared_tags(q->tag_set->flags);
+        if (!is_sq)
 #if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER)
                 ctx.name = "kyber";
 #else
                 return;
 #endif
+#endif // CONFIG_MQ_IOSCHED_DEFAULT_ADIOS
 
         e = elevator_find_get(ctx.name);
         if (!e)

From a6e3ab8700abdb42b39e2a275449d26a37623894 Mon Sep 17 00:00:00 2001
From: Masahito S
Date: Fri, 9 May 2025 15:51:43 +0900
Subject: [PATCH 26/26] ADIOS: Make ADIOS the Default I/O scheduler

---
 block/Kconfig.iosched | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index e98585dd83e0f4..dcbcf7c8a3b225 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -18,7 +18,7 @@ config MQ_IOSCHED_KYBER
 config MQ_IOSCHED_ADIOS
         tristate "Adaptive Deadline I/O scheduler"
-        default m
+        default y
         help
           The Adaptive Deadline I/O Scheduler (ADIOS) is a multi-queue I/O
           scheduler with learning-based adaptive latency control.
@@ -26,7 +26,7 @@ config MQ_IOSCHED_ADIOS
 config MQ_IOSCHED_DEFAULT_ADIOS
         bool "Enable ADIOS I/O scheduler as default MQ I/O scheduler"
         depends on MQ_IOSCHED_ADIOS=y
-        default n
+        default y
         help
           Enable the ADIOS I/O scheduler as the default scheduler for MQ I/O.