diff --git a/Makefile b/Makefile index a082a1d7c7d9b4..22318c98072274 100644 --- a/Makefile +++ b/Makefile @@ -1067,11 +1067,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check -# conserve stack if available -ifdef CONFIG_CC_IS_GCC -KBUILD_CFLAGS += -fconserve-stack -endif - # Ensure compilers do not transform certain loops into calls to wcslen() KBUILD_CFLAGS += -fno-builtin-wcslen diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1a27efcf3c205a..2f06bd6847edd1 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -75,7 +75,7 @@ export BITS # # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a -mno-avx2 -fno-tree-vectorize KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b3ab80a03365cf..5e883b397ff3f2 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -26,6 +26,7 @@ struct pci_sysdata { #if IS_ENABLED(CONFIG_VMD) struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ #endif + struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */ }; extern int pci_routeirq; @@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus *bus) #define is_vmd(bus) false #endif /* CONFIG_VMD */ +static inline bool is_nvme_remap(struct pci_bus *bus) +{ + return to_pci_sysdata(bus)->nvme_remap_dev != NULL; +} + /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index ddb798603201ef..7c20387d82029a 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) return 0; } -#if IS_ENABLED(CONFIG_VMD) struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) { +#if IS_ENABLED(CONFIG_VMD) if (is_vmd(dev->bus)) return to_pci_sysdata(dev->bus)->vmd_dev; +#endif + + if (is_nvme_remap(dev->bus)) + return to_pci_sysdata(dev->bus)->nvme_remap_dev; return dev; } -#endif diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 27f11320b8d12d..dcbcf7c8a3b225 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -16,6 +16,20 @@ config MQ_IOSCHED_KYBER synchronous writes, it will self-tune queue depths to achieve that goal. +config MQ_IOSCHED_ADIOS + tristate "Adaptive Deadline I/O scheduler" + default y + help + The Adaptive Deadline I/O Scheduler (ADIOS) is a multi-queue I/O + scheduler with learning-based adaptive latency control. + +config MQ_IOSCHED_DEFAULT_ADIOS + bool "Enable ADIOS I/O scheduler as default MQ I/O scheduler" + depends on MQ_IOSCHED_ADIOS=y + default y + help + Enable the ADIOS I/O scheduler as the default scheduler for MQ I/O. 
+ config IOSCHED_BFQ tristate "BFQ I/O scheduler" select BLK_ICQ diff --git a/block/Makefile b/block/Makefile index c65f4da937026f..105b12fd86b842 100644 --- a/block/Makefile +++ b/block/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o +obj-$(CONFIG_MQ_IOSCHED_ADIOS) += adios.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o @@ -36,3 +37,10 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o + +all: + make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules + +clean: + make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean + diff --git a/block/adios.c b/block/adios.c new file mode 100644 index 00000000000000..41de52785cd1ed --- /dev/null +++ b/block/adios.c @@ -0,0 +1,2006 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Adaptive Deadline I/O Scheduler (ADIOS) + * Copyright (C) 2025 Masahito Suzuki + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "elevator.h" +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" + +#define ADIOS_VERSION "3.1.7" + +/* Request Types: + * + * Tier 0 (Highest Priority): Emergency & System Integrity Requests + * ----------------------------------------------------------------- + * - Target: Requests with the BLK_MQ_INSERT_AT_HEAD flag. + * - Purpose: For critical, non-negotiable operations such as device error + * recovery or flush sequences that must bypass all other scheduling logic. + * - Implementation: Placed in a dedicated, high-priority FIFO queue + * (`prio_queue[0]`) for immediate dispatch. + * + * Tier 1 (High Priority): I/O Barrier Guarantees + * --------------------------------------------------------------- + * - Target: Requests with the REQ_OP_FLUSH flag. + * - Purpose: To enforce a strict I/O barrier. When a flush request is + * received, the scheduler stops processing new requests from its main + * queues until all preceding requests have been completed. This guarantees + * the order of operations required by filesystems for data integrity. + * - Implementation: A state flag (ADIOS_STATE_BARRIER) halts + * insertion into the main deadline tree. The barrier request and all + * subsequent requests are held in a temporary `barrier_queue`. Once the + * main queues are drained, the barrier request and the subsequent requests + * are released from the pending queue back into the scheduler. + * + * Tier 2 (Medium Priority): Application Responsiveness + * ---------------------------------------------------- + * - Target: Normal synchronous requests (e.g., from standard file reads). + * - Purpose: To ensure correct application behavior for operations that + * depend on sequential I/O completion (e.g., file system mounts) and to + * provide low latency for interactive applications. + * - Implementation: The deadline for these requests is set to their start + * time (`rq->start_time_ns`). This effectively enforces FIFO-like behavior + * within the deadline-sorted red-black tree, preventing out-of-order + * execution of dependent synchronous operations. 
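+ * (For example, two synchronous reads submitted 1 ms apart keep their
+ * submission-time deadlines and are therefore dispatched in that order.)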
+ * + * Tier 3 (Normal Priority): Background Throughput + * ----------------------------------------------- + * - Target: Asynchronous requests. + * - Purpose: To maximize disk throughput for background tasks where latency + * is not critical. + * - Implementation: These are the only requests where ADIOS's adaptive + * latency prediction model is used. A dynamic deadline is calculated based + * on the predicted I/O latency, allowing for aggressive reordering to + * optimize I/O efficiency. + * + * Dispatch Logic: + * The scheduler always dispatches requests in strict priority order: + * 1. prio_queue[0] (Tier 0) + * 2. The deadline-sorted batch queue (which naturally prioritizes Tier 2 + * over Tier 3 due to their calculated deadlines). + * 3. Barrier-pending requests are handled only after the main queues are empty. + */ + +// Global variable to control the latency +static u64 default_global_latency_window = 16000000ULL; +static u64 default_global_latency_window_rotational = 22000000ULL; +// Ratio below which batch queues should be refilled +static u8 default_bq_refill_below_ratio = 20; +// Maximum latency sample to input +static u64 default_lat_model_latency_limit = 500000000ULL; +// Batch ordering strategy +static u64 default_batch_order = 0; + +/* Compliance Flags: + * 0x1: Async requests will not be reordered based on the predicted latency + */ +enum adios_compliance_flags { + ADIOS_CF_FIXORDER = 1U << 0, +}; + +// Flags to control compliance with block layer constraints +static u64 default_compliance_flags = 0x0; + +// Dynamic thresholds for shrinkage +static u32 default_lm_shrink_at_kreqs = 5000; +static u32 default_lm_shrink_at_gbytes = 50; +static u32 default_lm_shrink_resist = 2; + +enum adios_optype { + ADIOS_READ = 0, + ADIOS_WRITE = 1, + ADIOS_DISCARD = 2, + ADIOS_OTHER = 3, + ADIOS_OPTYPES = 4, +}; + +// Latency targets for each operation type +static u64 default_latency_target[ADIOS_OPTYPES] = { + [ADIOS_READ] = 2ULL * NSEC_PER_MSEC, + [ADIOS_WRITE] = 2000ULL * NSEC_PER_MSEC, + [ADIOS_DISCARD] = 8000ULL * NSEC_PER_MSEC, + [ADIOS_OTHER] = 0ULL * NSEC_PER_MSEC, +}; + +// Maximum batch size limits for each operation type +static u32 default_batch_limit[ADIOS_OPTYPES] = { + [ADIOS_READ] = 36, + [ADIOS_WRITE] = 72, + [ADIOS_DISCARD] = 1, + [ADIOS_OTHER] = 1, +}; + +enum adios_batch_order { + ADIOS_BO_OPTYPE = 0, + ADIOS_BO_ELEVATOR = 1, +}; + +// Thresholds for latency model control +#define LM_BLOCK_SIZE_THRESHOLD 4096 +#define LM_SAMPLES_THRESHOLD 1024 +#define LM_INTERVAL_THRESHOLD 1500 +#define LM_OUTLIER_PERCENTILE 99 +#define LM_LAT_BUCKET_COUNT 64 + +#define ADIOS_PQ_LEVELS 2 +#define ADIOS_DL_TYPES 2 +#define ADIOS_BQ_PAGES 2 + +static u32 default_dl_prio[ADIOS_DL_TYPES] = {8, 0}; + +// Bit flags for the atomic state variable, indicating which queues have requests. 
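+// These bits are grouped into two-bit pairs (priority queues, deadline
+// trees, batch queue pages) plus a single barrier-pending bit;
+// eval_adios_state() shifts by ADIOS_STATE_{PQ,DL,BQ,BP} and masks with
+// 0x3 to read one group.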
+enum adios_state_flags { + ADIOS_STATE_PQ_0 = 1U << 0, + ADIOS_STATE_PQ_1 = 1U << 1, + ADIOS_STATE_DL_0 = 1U << 2, + ADIOS_STATE_DL_1 = 1U << 3, + ADIOS_STATE_BQ_PAGE_0 = 1U << 4, + ADIOS_STATE_BQ_PAGE_1 = 1U << 5, + ADIOS_STATE_BARRIER = 1U << 6, +}; +#define ADIOS_STATE_PQ 0 +#define ADIOS_STATE_DL 2 +#define ADIOS_STATE_BQ 4 +#define ADIOS_STATE_BP 6 + +// Temporal granularity of the deadline tree node (dl_group) +#define ADIOS_QUANTUM_SHIFT 20 + +#define ADIOS_MAX_INSERTS_PER_LOCK 72 +#define ADIOS_MAX_DELETES_PER_LOCK 24 + +// Structure to hold latency bucket data for small requests +struct latency_bucket_small { + u64 weighted_sum_latency; + u64 sum_of_weights; +}; + +// Structure to hold latency bucket data for large requests +struct latency_bucket_large { + u64 weighted_sum_latency; + u64 weighted_sum_block_size; + u64 sum_of_weights; +}; + +// Structure to hold per-cpu buckets, improving data locality and code clarity. +struct lm_buckets { + struct latency_bucket_small small_bucket[LM_LAT_BUCKET_COUNT]; + struct latency_bucket_large large_bucket[LM_LAT_BUCKET_COUNT]; +}; + +// Structure to hold RCU-protected latency model parameters +struct latency_model_params { + u64 base; + u64 slope; + u64 small_sum_delay; + u64 small_count; + u64 large_sum_delay; + u64 large_sum_bsize; + u64 last_update_jiffies; + struct rcu_head rcu; +}; + +// Structure to hold the latency model context data +struct latency_model { + spinlock_t update_lock; + struct latency_model_params __rcu *params; + + // Per-CPU buckets to avoid lock contention on the completion path + struct lm_buckets __percpu *pcpu_buckets; + + u32 lm_shrink_at_kreqs; + u32 lm_shrink_at_gbytes; + u8 lm_shrink_resist; +}; + +union adios_in_flight_rqs { + atomic64_t atomic; + u64 scalar; + struct { + u64 count: 16; + u64 total_pred_lat: 48; + }; +}; + +// Adios scheduler data +struct adios_data { + spinlock_t pq_lock; + struct list_head prio_queue[2]; + + struct rb_root_cached dl_tree[2]; + spinlock_t lock; + s64 dl_bias; + s32 dl_prio[2]; + + atomic_t state; + u8 bq_state[ADIOS_BQ_PAGES]; + + void (*insert_request_fn)(struct blk_mq_hw_ctx *, struct request *, + blk_insert_t, struct list_head *); + + u64 global_latency_window; + u64 compliance_flags; + u64 latency_target[ADIOS_OPTYPES]; + u32 batch_limit[ADIOS_OPTYPES]; + u32 batch_actual_max_size[ADIOS_OPTYPES]; + u32 batch_actual_max_total; + u32 async_depth; + u32 lat_model_latency_limit; + u8 bq_refill_below_ratio; + u8 is_rotational; + u8 batch_order; + u8 elv_direction; + sector_t head_pos; + sector_t last_completed_pos; + + bool bq_page; + struct list_head batch_queue[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; + u32 batch_count[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; + u8 bq_batch_order[ADIOS_BQ_PAGES]; + spinlock_t bq_lock; + spinlock_t barrier_lock; + struct list_head barrier_queue; + + struct lm_buckets *aggr_buckets; + + struct latency_model latency_model[ADIOS_OPTYPES]; + struct timer_list update_timer; + + union adios_in_flight_rqs in_flight_rqs; + atomic64_t total_pred_lat; + u64 last_completed_time; + + struct kmem_cache *rq_data_pool; + struct kmem_cache *dl_group_pool; + + struct request_queue *queue; +}; + +// List of requests with the same deadline in the deadline-sorted tree +struct dl_group { + struct rb_node node; + struct list_head rqs; + u64 deadline; +} __attribute__((aligned(64))); + +// Structure to hold scheduler-specific data for each request +struct adios_rq_data { + struct list_head *dl_group; + struct list_head dl_node; + + struct request *rq; + u64 deadline; + u64 
pred_lat; + u32 block_size; + bool managed; +} __attribute__((aligned(64))); + +static const int adios_prio_to_wmult[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +static inline bool compliant(struct adios_data *ad, u32 flag) { + return ad->compliance_flags & flag; +} + +// Count the number of entries in aggregated small buckets +static u64 lm_count_small_entries(struct latency_bucket_small *buckets) { + u64 total_weight = 0; + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) + total_weight += buckets[i].sum_of_weights; + return total_weight; +} + +// Update the small buckets in the latency model from aggregated data +static bool lm_update_small_buckets(struct latency_model *model, + struct latency_model_params *params, + struct latency_bucket_small *buckets, + u64 total_weight, bool count_all) { + u64 sum_latency = 0; + u64 sum_weight = 0; + u64 cumulative_weight = 0, threshold_weight = 0; + u8 outlier_threshold_bucket = 0; + u8 outlier_percentile = LM_OUTLIER_PERCENTILE; + u8 reduction; + + if (count_all) + outlier_percentile = 100; + + // Calculate the threshold weight for outlier detection + threshold_weight = (total_weight * outlier_percentile) / 100; + + // Identify the bucket that corresponds to the outlier threshold + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + cumulative_weight += buckets[i].sum_of_weights; + if (cumulative_weight >= threshold_weight) { + outlier_threshold_bucket = i; + break; + } + } + + // Calculate the average latency, excluding outliers + for (u8 i = 0; i <= outlier_threshold_bucket; i++) { + struct latency_bucket_small *bucket = &buckets[i]; + if (i < outlier_threshold_bucket) { + sum_latency += bucket->weighted_sum_latency; + sum_weight += bucket->sum_of_weights; + } else { + // The threshold bucket's contribution is proportional + u64 remaining_weight = + threshold_weight - (cumulative_weight - bucket->sum_of_weights); + if (bucket->sum_of_weights > 0) { + sum_latency += div_u64(bucket->weighted_sum_latency * + remaining_weight, bucket->sum_of_weights); + sum_weight += remaining_weight; + } + } + } + + // Shrink the model if it reaches at the readjustment threshold + if (params->small_count >= 1000ULL * model->lm_shrink_at_kreqs) { + reduction = model->lm_shrink_resist; + if (params->small_count >> reduction) { + params->small_sum_delay -= params->small_sum_delay >> reduction; + params->small_count -= params->small_count >> reduction; + } + } + + if (!sum_weight) + return false; + + // Accumulate the average latency into the statistics + params->small_sum_delay += sum_latency; + params->small_count += sum_weight; + + return true; +} + +// Count the number of entries in aggregated large buckets +static u64 lm_count_large_entries(struct latency_bucket_large *buckets) { + u64 total_weight = 0; + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) + total_weight += buckets[i].sum_of_weights; + return total_weight; +} + +// Update the large buckets in the latency model from aggregated data +static bool lm_update_large_buckets(struct latency_model *model, + struct latency_model_params *params, + struct latency_bucket_large *buckets, + u64 total_weight, bool count_all) { + s64 sum_latency = 0; + u64 sum_block_size = 0, intercept; + u64 cumulative_weight = 0, threshold_weight = 0; + u64 sum_weight 
= 0; + u8 outlier_threshold_bucket = 0; + u8 outlier_percentile = LM_OUTLIER_PERCENTILE; + u8 reduction; + + if (count_all) + outlier_percentile = 100; + + // Calculate the threshold weight for outlier detection + threshold_weight = (total_weight * outlier_percentile) / 100; + + // Identify the bucket that corresponds to the outlier threshold + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + cumulative_weight += buckets[i].sum_of_weights; + if (cumulative_weight >= threshold_weight) { + outlier_threshold_bucket = i; + break; + } + } + + // Calculate the average latency and block size, excluding outliers + for (u8 i = 0; i <= outlier_threshold_bucket; i++) { + struct latency_bucket_large *bucket = &buckets[i]; + if (i < outlier_threshold_bucket) { + sum_latency += bucket->weighted_sum_latency; + sum_block_size += bucket->weighted_sum_block_size; + sum_weight += bucket->sum_of_weights; + } else { + // The threshold bucket's contribution is proportional + u64 remaining_weight = + threshold_weight - (cumulative_weight - bucket->sum_of_weights); + if (bucket->sum_of_weights > 0) { + sum_latency += div_u64(bucket->weighted_sum_latency * + remaining_weight, bucket->sum_of_weights); + sum_block_size += div_u64(bucket->weighted_sum_block_size * + remaining_weight, bucket->sum_of_weights); + sum_weight += remaining_weight; + } + } + } + + if (!sum_weight) + return false; + + // Shrink the model if it reaches at the readjustment threshold + if (params->large_sum_bsize >= 0x40000000ULL * model->lm_shrink_at_gbytes) { + reduction = model->lm_shrink_resist; + if (params->large_sum_bsize >> reduction) { + params->large_sum_delay -= params->large_sum_delay >> reduction; + params->large_sum_bsize -= params->large_sum_bsize >> reduction; + } + } + + // Accumulate the average delay into the statistics + intercept = params->base; + if (sum_latency > intercept) + sum_latency -= intercept; + + params->large_sum_delay += sum_latency; + params->large_sum_bsize += sum_block_size; + + return true; +} + +static void reset_buckets(struct lm_buckets *buckets) +{ memset(buckets, 0, sizeof(*buckets)); } + +static void lm_reset_pcpu_buckets(struct latency_model *model) { + int cpu; + for_each_possible_cpu(cpu) + reset_buckets(per_cpu_ptr(model->pcpu_buckets, cpu)); +} + +// Update the latency model parameters and statistics +static void latency_model_update( + struct adios_data *ad, struct latency_model *model) { + u64 now; + u64 small_weight, large_weight; + bool time_elapsed; + bool small_processed = false, large_processed = false; + struct lm_buckets *aggr = ad->aggr_buckets; + struct latency_bucket_small *asb; + struct latency_bucket_large *alb; + struct lm_buckets *pcpu_b; + unsigned long flags; + int cpu; + struct latency_model_params *old_params, *new_params; + + spin_lock_irqsave(&model->update_lock, flags); + + old_params = rcu_dereference_protected(model->params, + lockdep_is_held(&model->update_lock)); + new_params = kmemdup(old_params, sizeof(*new_params), GFP_ATOMIC); + if (!new_params) { + spin_unlock_irqrestore(&model->update_lock, flags); + return; + } + + // Aggregate data from all CPUs and reset per-cpu buckets. 
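+ // Each CPU's counters are folded into ad->aggr_buckets and then zeroed,
+ // so every per-cpu sample contributes to the model statistics exactly once.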
+ for_each_possible_cpu(cpu) { + pcpu_b = per_cpu_ptr(model->pcpu_buckets, cpu); + + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + if (pcpu_b->small_bucket[i].sum_of_weights) { + asb = &aggr->small_bucket[i]; + asb->sum_of_weights += + pcpu_b->small_bucket[i].sum_of_weights; + asb->weighted_sum_latency += + pcpu_b->small_bucket[i].weighted_sum_latency; + } + if (pcpu_b->large_bucket[i].sum_of_weights) { + alb = &aggr->large_bucket[i]; + alb->sum_of_weights += + pcpu_b->large_bucket[i].sum_of_weights; + alb->weighted_sum_latency += + pcpu_b->large_bucket[i].weighted_sum_latency; + alb->weighted_sum_block_size += + pcpu_b->large_bucket[i].weighted_sum_block_size; + } + } + // Reset per-cpu buckets after aggregating + reset_buckets(pcpu_b); + } + + // Count the number of entries in aggregated buckets + small_weight = lm_count_small_entries(aggr->small_bucket); + large_weight = lm_count_large_entries(aggr->large_bucket); + + // Whether enough time has elapsed since the last update + now = jiffies; + time_elapsed = unlikely(!new_params->base) || + new_params->last_update_jiffies + + msecs_to_jiffies(LM_INTERVAL_THRESHOLD) <= now; + + // Update small buckets + if (small_weight && (time_elapsed || + LM_SAMPLES_THRESHOLD <= small_weight || !new_params->base)) { + small_processed = lm_update_small_buckets(model, new_params, + aggr->small_bucket, small_weight, !new_params->base); + memset(&aggr->small_bucket[0], 0, sizeof(aggr->small_bucket)); + } + // Update large buckets + if (large_weight && (time_elapsed || + LM_SAMPLES_THRESHOLD <= large_weight || !new_params->slope)) { + large_processed = lm_update_large_buckets(model, new_params, + aggr->large_bucket, large_weight, !new_params->slope); + memset(&aggr->large_bucket[0], 0, sizeof(aggr->large_bucket)); + } + + // Update the base parameter if small bucket was processed + if (small_processed && likely(new_params->small_count)) + new_params->base = div_u64(new_params->small_sum_delay, + new_params->small_count); + + // Update the slope parameter if large bucket was processed + if (large_processed && likely(new_params->large_sum_bsize)) + new_params->slope = div_u64(new_params->large_sum_delay, + DIV_ROUND_UP_ULL(new_params->large_sum_bsize, 1024)); + + // Update last updated jiffies if update happened or time has elapsed + if (small_processed || large_processed || time_elapsed) + new_params->last_update_jiffies = now; + + rcu_assign_pointer(model->params, new_params); + spin_unlock_irqrestore(&model->update_lock, flags); + + kfree_rcu(old_params, rcu); +} + +// Determine the bucket index for a given measured and predicted latency +static u8 lm_input_bucket_index(u64 measured, u64 predicted) { + u8 bucket_index; + + if (measured < predicted * 2) + bucket_index = div_u64((measured * 20), predicted); + else if (measured < predicted * 5) + bucket_index = div_u64((measured * 10), predicted) + 20; + else + bucket_index = div_u64((measured * 3), predicted) + 40; + + return bucket_index; +} + +// Input latency data into the latency model +static void latency_model_input(struct adios_data *ad, + struct latency_model *model, + u32 block_size, u64 latency, u64 pred_lat, u32 weight) { + unsigned long flags; + u8 bucket_index; + struct lm_buckets *buckets; + u64 current_base; + struct latency_model_params *params; + + local_irq_save(flags); + buckets = per_cpu_ptr(model->pcpu_buckets, __smp_processor_id()); + + rcu_read_lock(); + params = rcu_dereference(model->params); + current_base = params->base; + rcu_read_unlock(); + + if (block_size <= 
LM_BLOCK_SIZE_THRESHOLD) { + // Handle small requests + bucket_index = lm_input_bucket_index(latency, current_base ?: 1); + + if (bucket_index >= LM_LAT_BUCKET_COUNT) + bucket_index = LM_LAT_BUCKET_COUNT - 1; + + buckets->small_bucket[bucket_index].sum_of_weights += weight; + buckets->small_bucket[bucket_index].weighted_sum_latency += + latency * weight; + + local_irq_restore(flags); + + if (unlikely(!current_base)) { + latency_model_update(ad, model); + return; + } + } else { + // Handle large requests + if (!current_base || !pred_lat) { + local_irq_restore(flags); + return; + } + + bucket_index = lm_input_bucket_index(latency, pred_lat); + + if (bucket_index >= LM_LAT_BUCKET_COUNT) + bucket_index = LM_LAT_BUCKET_COUNT - 1; + + buckets->large_bucket[bucket_index].sum_of_weights += weight; + buckets->large_bucket[bucket_index].weighted_sum_latency += + latency * weight; + buckets->large_bucket[bucket_index].weighted_sum_block_size += + block_size * weight; + + local_irq_restore(flags); + } +} + +// Predict the latency for a given block size using the latency model +static u64 latency_model_predict(struct latency_model *model, u32 block_size) { + u64 result; + struct latency_model_params *params; + + rcu_read_lock(); + params = rcu_dereference(model->params); + + result = params->base; + if (block_size > LM_BLOCK_SIZE_THRESHOLD) + result += params->slope * + DIV_ROUND_UP_ULL(block_size - LM_BLOCK_SIZE_THRESHOLD, 1024); + + rcu_read_unlock(); + + return result; +} + +// Determine the type of operation based on request flags +static u8 adios_optype(struct request *rq) { + switch (rq->cmd_flags & REQ_OP_MASK) { + case REQ_OP_READ: + return ADIOS_READ; + case REQ_OP_WRITE: + return ADIOS_WRITE; + case REQ_OP_DISCARD: + return ADIOS_DISCARD; + default: + return ADIOS_OTHER; + } +} + +static inline u8 adios_optype_not_read(struct request *rq) { + return (rq->cmd_flags & REQ_OP_MASK) != REQ_OP_READ; +} + +// Helper function to retrieve adios_rq_data from a request +static inline struct adios_rq_data *get_rq_data(struct request *rq) { + return rq->elv.priv[0]; +} + +static inline +void set_adios_state(struct adios_data *ad, u32 shift, u32 idx, bool flag) { + if (flag) + atomic_or(1U << (idx + shift), &ad->state); + else + atomic_andnot(1U << (idx + shift), &ad->state); +} + +static inline u32 get_adios_state(struct adios_data *ad) +{ return atomic_read(&ad->state); } + +static inline u32 eval_this_adios_state(u32 state, u32 shift) +{ return (state >> shift) & 0x3; } + +static inline u32 eval_adios_state(struct adios_data *ad, u32 shift) +{ return eval_this_adios_state(get_adios_state(ad), shift); } + +// Add a request to the deadline-sorted red-black tree +static void add_to_dl_tree( + struct adios_data *ad, bool dl_idx, struct request *rq) { + struct rb_root_cached *root = &ad->dl_tree[dl_idx]; + struct rb_node **link = &(root->rb_root.rb_node), *parent = NULL; + bool leftmost = true; + struct adios_rq_data *rd = get_rq_data(rq); + struct dl_group *dlg; + u64 deadline; + bool was_empty = RB_EMPTY_ROOT(&root->rb_root); + + /* Tier-2: Synchronous Requests + * - Needs to be FIFO within a same optype + * - Relaxed order between different optypes + * - basically needs to be processed in early time */ + rd->deadline = rq->start_time_ns; + + /* Tier-3: Aynchronous Requests + * - Can be reordered and delayed freely */ + if (!(rq->cmd_flags & REQ_SYNC)) { + rd->deadline += ad->latency_target[adios_optype(rq)]; + if (!compliant(ad, ADIOS_CF_FIXORDER)) + rd->deadline += rd->pred_lat; + } + + // Now quantize 
the deadline (-> dlg->deadline == RB-Tree key) + deadline = rd->deadline & ~((1ULL << ADIOS_QUANTUM_SHIFT) - 1); + + while (*link) { + dlg = rb_entry(*link, struct dl_group, node); + s64 diff = deadline - dlg->deadline; + + parent = *link; + if (diff < 0) { + link = &((*link)->rb_left); + } else if (diff > 0) { + link = &((*link)->rb_right); + leftmost = false; + } else { // diff == 0 + goto found; + } + } + + dlg = rb_entry_safe(parent, struct dl_group, node); + if (!dlg || dlg->deadline != deadline) { + dlg = kmem_cache_zalloc(ad->dl_group_pool, GFP_ATOMIC); + if (!dlg) + return; + dlg->deadline = deadline; + INIT_LIST_HEAD(&dlg->rqs); + rb_link_node(&dlg->node, parent, link); + rb_insert_color_cached(&dlg->node, root, leftmost); + } +found: + list_add_tail(&rd->dl_node, &dlg->rqs); + rd->dl_group = &dlg->rqs; + + if (was_empty) + set_adios_state(ad, ADIOS_STATE_DL, dl_idx, true); +} + +// Remove a request from the deadline-sorted red-black tree +static void del_from_dl_tree( + struct adios_data *ad, bool dl_idx, struct request *rq) { + struct rb_root_cached *root = &ad->dl_tree[dl_idx]; + struct adios_rq_data *rd = get_rq_data(rq); + struct dl_group *dlg = container_of(rd->dl_group, struct dl_group, rqs); + + list_del_init(&rd->dl_node); + if (list_empty(&dlg->rqs)) { + rb_erase_cached(&dlg->node, root); + kmem_cache_free(ad->dl_group_pool, dlg); + } + rd->dl_group = NULL; + + if (RB_EMPTY_ROOT(&ad->dl_tree[dl_idx].rb_root)) + set_adios_state(ad, ADIOS_STATE_DL, dl_idx, false); +} + +// Remove a request from the scheduler +static void remove_request(struct adios_data *ad, struct request *rq) { + bool dl_idx = adios_optype_not_read(rq); + struct request_queue *q = rq->q; + struct adios_rq_data *rd = get_rq_data(rq); + + list_del_init(&rq->queuelist); + + // We might not be on the rbtree, if we are doing an insert merge + if (rd->dl_group) + del_from_dl_tree(ad, dl_idx, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; +} + +// Convert a queue depth to the corresponding word depth for shallow allocation +static int to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) { + struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; + const unsigned int nrr = hctx->queue->nr_requests; + + return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; +} + +// We limit the depth of request allocation for asynchronous and write requests +static void adios_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { + struct adios_data *ad = data->q->elevator->elevator_data; + + // Do not throttle synchronous reads + if (op_is_sync(opf) && !op_is_write(opf)) + return; + + data->shallow_depth = to_word_depth(data->hctx, ad->async_depth); +} + +// The number of requests in the queue was notified from the block layer +static void adios_depth_updated(struct request_queue *q) { + struct adios_data *ad = q->elevator->elevator_data; + + ad->async_depth = q->nr_requests; + blk_mq_set_min_shallow_depth(q, 1); +} + +// Handle request merging after a merge operation +static void adios_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) { + bool dl_idx = adios_optype_not_read(req); + struct adios_data *ad = q->elevator->elevator_data; + + // Reposition request in the deadline-sorted tree + del_from_dl_tree(ad, dl_idx, req); + add_to_dl_tree(ad, dl_idx, req); +} + +// Handle merging of requests after one has been merged into another +static void adios_merged_requests(struct request_queue *q, struct request *req, + struct request *next) { + struct 
adios_data *ad = q->elevator->elevator_data; + + lockdep_assert_held(&ad->lock); + + // kill knowledge of next, this one is a goner + remove_request(ad, next); +} + +// Try to merge a bio into an existing rq before associating it with an rq +static bool adios_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) { + unsigned long flags; + struct adios_data *ad = q->elevator->elevator_data; + struct request *free = NULL; + bool ret; + + if (eval_adios_state(ad, ADIOS_STATE_BP)) + return false; + + if (!spin_trylock_irqsave(&ad->lock, flags)) + return false; + + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irqrestore(&ad->lock, flags); + + if (free) + blk_mq_free_request(free); + + return ret; +} + +static bool merge_or_insert_to_dl_tree(struct adios_data *ad, + struct request *rq, struct request_queue *q, struct list_head *free) { + if (blk_mq_sched_try_insert_merge(q, rq, free)) + return true; + + bool dl_idx = adios_optype_not_read(rq); + add_to_dl_tree(ad, dl_idx, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + + return false; +} + +static void insert_to_prio_queue(struct adios_data *ad, + struct request *rq, bool pq_idx) { + struct adios_rq_data *rd = get_rq_data(rq); + + /* We're sure that rd->managed == true */ + union adios_in_flight_rqs ifr = { + .count = 1, + .total_pred_lat = rd->pred_lat, + }; + atomic64_add(ifr.scalar, &ad->in_flight_rqs.atomic); + + scoped_guard(spinlock_irqsave, &ad->pq_lock) { + bool was_empty = list_empty(&ad->prio_queue[pq_idx]); + list_add_tail(&rq->queuelist, &ad->prio_queue[pq_idx]); + if (was_empty) + set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, true); + } +} + +// Insert a request into the scheduler (after Read & Write models stabilized) +static void insert_request_post_stability(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_insert_t insert_flags, struct list_head *free) { + struct request_queue *q = hctx->queue; + struct adios_data *ad = q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + u8 optype = adios_optype(rq); + bool rq_is_flush; + + rd->managed = true; + rd->block_size = blk_rq_bytes(rq); + rd->pred_lat = + latency_model_predict(&ad->latency_model[optype], rd->block_size); + + /* Tier-0: BLK_MQ_INSERT_AT_HEAD Requests */ + if (insert_flags & BLK_MQ_INSERT_AT_HEAD) { + insert_to_prio_queue(ad, rq, 0); + return; + } + + /* + * Strict Barrier Handling for REQ_OP_FLUSH: + * If a flush request arrives, or if the scheduler is already in a + * barrier-pending state, all subsequent requests are diverted to a + * separate barrier_queue. This ensures that no new requests are processed + * until all work preceding the barrier is complete. 
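+ * Once the scheduler drains, release_barrier_requests() moves the flush
+ * itself to prio_queue[1] and re-inserts the requests held behind it
+ * (up to the next flush) into the deadline tree.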
+ */ + rq_is_flush = rq->cmd_flags & REQ_OP_FLUSH; + if (eval_adios_state(ad, ADIOS_STATE_BP) || rq_is_flush) { + scoped_guard(spinlock_irqsave, &ad->barrier_lock) { + if (rq_is_flush) + set_adios_state(ad, ADIOS_STATE_BP, 0, true); + list_add_tail(&rq->queuelist, &ad->barrier_queue); + } + return; + } + + if (merge_or_insert_to_dl_tree(ad, rq, q, free)) + return; +} + +// Insert a request into the scheduler (before Read & Write models stabilizes) +static void insert_request_pre_stability(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_insert_t insert_flags, struct list_head *free) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + u8 optype = adios_optype(rq); + u8 pq_idx = !(insert_flags & BLK_MQ_INSERT_AT_HEAD); + bool models_stable = false; + + rd->managed = true; + rd->block_size = blk_rq_bytes(rq); + rd->pred_lat = + latency_model_predict(&ad->latency_model[optype], rd->block_size); + + insert_to_prio_queue(ad, rq, pq_idx); + + rcu_read_lock(); + if (rcu_dereference(ad->latency_model[ADIOS_READ].params)->base > 0 && + rcu_dereference(ad->latency_model[ADIOS_WRITE].params)->base > 0) + models_stable = true; + rcu_read_unlock(); + + if (models_stable) + ad->insert_request_fn = insert_request_post_stability; +} + +// Insert multiple requests into the scheduler +static void adios_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, + blk_insert_t insert_flags) { + struct request_queue *q = hctx->queue; + struct adios_data *ad = q->elevator->elevator_data; + struct request *rq; + bool stop = false; + LIST_HEAD(free); + + do { + scoped_guard(spinlock_irqsave, &ad->lock) + for (int i = 0; i < ADIOS_MAX_INSERTS_PER_LOCK; i++) { + if (list_empty(list)) { + stop = true; + break; + } + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + ad->insert_request_fn(hctx, rq, insert_flags, &free); + }} while (!stop); + + blk_mq_free_requests(&free); +} + +// Prepare a request before it is inserted into the scheduler +static void adios_prepare_request(struct request *rq) { + struct adios_data *ad = rq->q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + + rq->elv.priv[0] = NULL; + + /* Allocate adios_rq_data from the memory pool */ + rd = kmem_cache_zalloc(ad->rq_data_pool, GFP_ATOMIC); + if (WARN(!rd, "adios_prepare_request: " + "Failed to allocate memory from rq_data_pool. rd is NULL\n")) + return; + + rd->rq = rq; + rq->elv.priv[0] = rd; +} + +static struct adios_rq_data *get_dl_first_rd(struct adios_data *ad, bool idx) { + struct rb_root_cached *root = &ad->dl_tree[idx]; + struct rb_node *first = rb_first_cached(root); + struct dl_group *dl_group = rb_entry(first, struct dl_group, node); + + return list_first_entry(&dl_group->rqs, struct adios_rq_data, dl_node); +} + +// Comparison function for sorting requests by block address +static int cmp_rq_pos(void *priv, + const struct list_head *a, const struct list_head *b) { + struct request *rq_a = list_entry(a, struct request, queuelist); + struct request *rq_b = list_entry(b, struct request, queuelist); + u64 pos_a = blk_rq_pos(rq_a); + u64 pos_b = blk_rq_pos(rq_b); + + return (int)(pos_a > pos_b) - (int)(pos_a < pos_b); +} + +#ifndef list_last_entry_or_null +#define list_last_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? 
list_last_entry(ptr, type, member) : NULL) +#endif + +// Update the elevator direction +static void update_elv_direction(struct adios_data *ad) { + if (!ad->is_rotational) + return; + + bool page = ad->bq_page; + struct list_head *q = &ad->batch_queue[page][1]; + if (ad->bq_batch_order[page] < ADIOS_BO_ELEVATOR || list_empty(q)) { + ad->elv_direction = 0; + return; + } + + // Get first and last request positions in the queue + struct request *rq_a = list_first_entry(q, struct request, queuelist); + struct request *rq_b = list_last_entry (q, struct request, queuelist); + u64 pos_a = blk_rq_pos(rq_a); + u64 pos_b = blk_rq_pos(rq_b); + u64 avg_rq_pos = (pos_a + pos_b) >> 1; + + ad->elv_direction = !!(ad->head_pos > avg_rq_pos); +} + +// Fill the batch queues with requests from the deadline-sorted red-black tree +static bool fill_batch_queues(struct adios_data *ad, u64 tpl) { + struct adios_rq_data *rd; + struct request *rq; + struct list_head *dest_q; + u8 dest_idx; + u64 added_lat = 0; + u32 optype_count[ADIOS_OPTYPES] = {0}; + u32 count = 0; + u8 optype; + bool page = !ad->bq_page, dl_idx, bias_idx, update_bias; + u32 dl_queued; + u8 bq_batch_order; + bool stop = false; + + // Reset batch queue counts for the back page + memset(&ad->batch_count[page], 0, sizeof(ad->batch_count[page])); + + ad->bq_batch_order[page] = + bq_batch_order = ad->batch_order; + + do { + scoped_guard(spinlock_irqsave, &ad->lock) + for (int i = 0; i < ADIOS_MAX_DELETES_PER_LOCK; i++) { + bool has_base = false; + + dl_queued = eval_adios_state(ad, ADIOS_STATE_DL); + // Check if there are any requests queued in the deadline tree + if (!dl_queued) { + stop = true; + break; + } + + // Reads if both queues have requests, otherwise pick the non-empty. + dl_idx = dl_queued >> 1; + + // Get the first request from the deadline-sorted tree + rd = get_dl_first_rd(ad, dl_idx); + + bias_idx = ad->dl_bias < 0; + // If read and write requests are queued, choose one based on bias + if (dl_queued == 0x3) { + struct adios_rq_data *trd[2] = {get_dl_first_rd(ad, 0), rd}; + rd = trd[bias_idx]; + + update_bias = (trd[bias_idx]->deadline > trd[!bias_idx]->deadline); + } else + update_bias = (bias_idx == dl_idx); + + rq = rd->rq; + optype = adios_optype(rq); + + rcu_read_lock(); + has_base = + !!rcu_dereference(ad->latency_model[optype].params)->base; + rcu_read_unlock(); + + // Check batch size and total predicted latency + if (count && (!has_base || + ad->batch_count[page][optype] >= ad->batch_limit[optype] || + (tpl + added_lat + rd->pred_lat) > ad->global_latency_window)) { + stop = true; + break; + } + + if (update_bias) { + s64 sign = ((s64)bias_idx << 1) - 1; + if (unlikely(!rd->pred_lat)) + ad->dl_bias = sign; + else + // Adjust the bias based on the predicted latency + ad->dl_bias += sign * (s64)((rd->pred_lat * + adios_prio_to_wmult[ad->dl_prio[bias_idx] + 20]) >> 10); + } + + remove_request(ad, rq); + + // Add request to the corresponding batch queue + dest_idx = (bq_batch_order == ADIOS_BO_OPTYPE || optype == ADIOS_OTHER)? 
+ optype : !!(rd->deadline != rq->start_time_ns); + dest_q = &ad->batch_queue[page][dest_idx]; + list_add_tail(&rq->queuelist, dest_q); + ad->bq_state[page] |= 1U << dest_idx; + ad->batch_count[page][optype]++; + optype_count[optype]++; + added_lat += rd->pred_lat; + count++; + }} while (!stop); + + if (bq_batch_order == ADIOS_BO_ELEVATOR && ad->batch_count[page][1] > 1) + list_sort(NULL, &ad->batch_queue[page][1], cmp_rq_pos); + + if (count) { + /* We're sure that every request's rd->managed == true */ + union adios_in_flight_rqs ifr = { + .count = count, + .total_pred_lat = added_lat, + }; + atomic64_add(ifr.scalar, &ad->in_flight_rqs.atomic); + + set_adios_state(ad, ADIOS_STATE_BQ, page, true); + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) + if (ad->batch_actual_max_size[optype] < optype_count[optype]) + ad->batch_actual_max_size[optype] = optype_count[optype]; + if (ad->batch_actual_max_total < count) + ad->batch_actual_max_total = count; + } + return count; +} + +// Flip to the next batch queue page +static void flip_bq_page(struct adios_data *ad) { + ad->bq_page = !ad->bq_page; + update_elv_direction(ad); +} + +// Pop a request from the specified index (optype or elevator tier) +static inline struct request *pop_bq_request( + struct adios_data *ad, u8 idx, bool direction) { + bool page = ad->bq_page; + struct list_head *q = &ad->batch_queue[page][idx]; + struct request *rq = direction ? + list_last_entry_or_null (q, struct request, queuelist): + list_first_entry_or_null(q, struct request, queuelist); + if (rq) { + list_del_init(&rq->queuelist); + if (list_empty(q)) + ad->bq_state[page] &= ~(1U << idx); + } + return rq; +} + +static struct request *pop_next_bq_request_optype(struct adios_data *ad) { + u32 bq_state = ad->bq_state[ad->bq_page]; + if (!bq_state) return NULL; + + struct request *rq; + u32 bq_idx = __builtin_ctz(bq_state); + + // Dispatch based on optype (FIFO within each) or single-queue elevator + rq = pop_bq_request(ad, bq_idx, false); + return rq; +} + +static struct request *pop_next_bq_request_elevator(struct adios_data *ad) { + u32 bq_state = ad->bq_state[ad->bq_page]; + if (!bq_state) return NULL; + + struct request *rq; + u32 bq_idx = __builtin_ctz(bq_state); + bool direction = (bq_idx == 1) & ad->elv_direction; + + // Tier-2 (sync) is always high priority + // Tier-3 (async) uses the pre-calculated elevator direction + rq = pop_bq_request(ad, bq_idx, direction); + + /* If batch queue for the sync requests just became empty */ + if (bq_idx == 0 && rq && !(bq_state & 0x1)) + update_elv_direction(ad); + + return rq; +} + +// Returns the state of the batch queue page +static inline bool bq_page_has_rq(u32 bq_state, bool page) { + return bq_state & (1U << page); +} + +// Dispatch a request from the batch queues +static struct request *dispatch_from_bq(struct adios_data *ad) { + struct request *rq; + + guard(spinlock_irqsave)(&ad->bq_lock); + + u32 state = get_adios_state(ad); + u32 bq_state = eval_this_adios_state(state, ADIOS_STATE_BQ); + u32 bq_curr_page_has_rq = bq_page_has_rq(bq_state, ad->bq_page); + union adios_in_flight_rqs ifr; + ifr.scalar = atomic64_read(&ad->in_flight_rqs.atomic); + u64 tpl = ifr.total_pred_lat; + + // Refill the batch queues if the back page is empty, dl_tree has work, and + // current page is empty or the total ongoing latency is below the threshold + if (!bq_page_has_rq(bq_state, !ad->bq_page) && + (!bq_curr_page_has_rq || (!tpl || tpl < div_u64( + ad->global_latency_window * ad->bq_refill_below_ratio, 100))) && + 
eval_this_adios_state(state, ADIOS_STATE_DL)) + fill_batch_queues(ad, tpl); + + // If current batch queue page is empty, and the other page has work, flip + if (!bq_curr_page_has_rq && + bq_page_has_rq(eval_adios_state(ad, ADIOS_STATE_BQ), !ad->bq_page)) + flip_bq_page(ad); + + // Use the per-page state to decide the dispatch logic, ensuring correctness + rq = (ad->bq_batch_order[ad->bq_page] == ADIOS_BO_ELEVATOR) ? + pop_next_bq_request_elevator(ad): + pop_next_bq_request_optype(ad); + + if (rq) { + bool page = ad->bq_page; + bool is_empty = !ad->bq_state[page]; + if (is_empty) + set_adios_state(ad, ADIOS_STATE_BQ, page, false); + return rq; + } + + return NULL; +} + +// Dispatch a request from the priority queue +static struct request *dispatch_from_pq(struct adios_data *ad) { + struct request *rq = NULL; + + guard(spinlock_irqsave)(&ad->pq_lock); + u32 pq_state = eval_adios_state(ad, ADIOS_STATE_PQ); + u8 pq_idx = pq_state >> 1; + struct list_head *q = &ad->prio_queue[pq_idx]; + + if (unlikely(list_empty(q))) return NULL; + + rq = list_first_entry(q, struct request, queuelist); + list_del_init(&rq->queuelist); + if (list_empty(q)) { + set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, false); + update_elv_direction(ad); + } + return rq; +} + +static bool release_barrier_requests(struct adios_data *ad) { + u32 moved_count = 0; + LIST_HEAD(local_list); + + scoped_guard(spinlock_irqsave, &ad->barrier_lock) { + if (!list_empty(&ad->barrier_queue)) { + struct request *trq, *next; + bool first_barrier_moved = false; + + list_for_each_entry_safe(trq, next, &ad->barrier_queue, queuelist) { + if (!first_barrier_moved) { + list_del_init(&trq->queuelist); + insert_to_prio_queue(ad, trq, 1); + moved_count++; + first_barrier_moved = true; + continue; + } + + if (trq->cmd_flags & REQ_OP_FLUSH) + break; + + list_move_tail(&trq->queuelist, &local_list); + moved_count++; + } + + if (list_empty(&ad->barrier_queue)) + set_adios_state(ad, ADIOS_STATE_BP, 0, false); + } + } + + if (!moved_count) + return false; + + if (!list_empty(&local_list)) { + struct request *trq, *next; + LIST_HEAD(free_list); + + /* ad->lock is already held */ + list_for_each_entry_safe(trq, next, &local_list, queuelist) { + list_del_init(&trq->queuelist); + if (merge_or_insert_to_dl_tree(ad, trq, ad->queue, &free_list)) + continue; + } + + if (!list_empty(&free_list)) + blk_mq_free_requests(&free_list); + } + + return true; +} + +// Dispatch a request to the hardware queue +static struct request *adios_dispatch_request(struct blk_mq_hw_ctx *hctx) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + struct request *rq; + +retry: + rq = dispatch_from_pq(ad); + if (rq) + goto found; + + rq = dispatch_from_bq(ad); + if (rq) + goto found; + + /* + * If all active queues are empty, check if we need to process a barrier. + * This is the trigger to release requests that were held in barrier_queue + * due to a REQ_OP_FLUSH barrier. 
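+ * The release is attempted only once in_flight_rqs.count has dropped to
+ * zero, i.e. nothing previously dispatched is still outstanding.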
+ */ + if (eval_adios_state(ad, ADIOS_STATE_BP)) { + union adios_in_flight_rqs ifr; + ifr.scalar = atomic64_read(&ad->in_flight_rqs.atomic); + if (!ifr.count) { + bool barrier_released = false; + scoped_guard(spinlock_irqsave, &ad->lock) + barrier_released = release_barrier_requests(ad); + if (barrier_released) + goto retry; + } + } + + return NULL; +found: + if (ad->is_rotational) + ad->head_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + rq->rq_flags |= RQF_STARTED; + return rq; +} + +// Timer callback function to periodically update latency models +static void update_timer_callback(struct timer_list *t) { + struct adios_data *ad = timer_container_of(ad, t, update_timer); + + for (u8 optype = 0; optype < ADIOS_OPTYPES; optype++) + latency_model_update(ad, &ad->latency_model[optype]); +} + +// Handle the completion of a request +static void adios_completed_request(struct request *rq, u64 now) { + struct adios_data *ad = rq->q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + union adios_in_flight_rqs ifr = { .scalar = 0 }; + + if (rd->managed) { + union adios_in_flight_rqs ifr_to_sub = { + .count = 1, + .total_pred_lat = rd->pred_lat, + }; + ifr.scalar = atomic64_sub_return( + ifr_to_sub.scalar, &ad->in_flight_rqs.atomic); + } + u8 optype = adios_optype(rq); + + if (optype == ADIOS_OTHER) { + // Non-positional commands make the head position unpredictable. + // Invalidate our knowledge of the last completed position. + if (ad->is_rotational) + ad->last_completed_pos = 0; + return; + } + + u64 lct = ad->last_completed_time ?: rq->io_start_time_ns; + ad->last_completed_time = (ifr.count) ? now : 0; + + if (!rq->io_start_time_ns || !rd->block_size || unlikely(now < lct)) + return; + + u64 latency = now - lct; + if (latency > ad->lat_model_latency_limit) + return; + + u32 weight = 1; + if (ad->is_rotational) { + sector_t current_pos = blk_rq_pos(rq); + // Only calculate seek distance if we have a valid last position. + if (ad->last_completed_pos > 0) { + u64 seek_distance = abs( + (s64)current_pos - (s64)ad->last_completed_pos); + weight = 65 - __builtin_clzll(seek_distance); + } + // Update (or re-synchronize) our knowledge of the head position. 
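+ // last_completed_pos ends up just past the final sector of this request,
+ // i.e. where the head is expected to sit after completion.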
+ ad->last_completed_pos = current_pos + blk_rq_sectors(rq); + } + + latency_model_input(ad, &ad->latency_model[optype], + rd->block_size, latency, rd->pred_lat, weight); + timer_reduce(&ad->update_timer, jiffies + msecs_to_jiffies(100)); +} + +// Clean up after a request is finished +static void adios_finish_request(struct request *rq) { + struct adios_data *ad = rq->q->elevator->elevator_data; + + if (rq->elv.priv[0]) { + // Free adios_rq_data back to the memory pool + kmem_cache_free(ad->rq_data_pool, get_rq_data(rq)); + rq->elv.priv[0] = NULL; + } +} + +// Check if there are any requests available for dispatch +static bool adios_has_work(struct blk_mq_hw_ctx *hctx) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + + return atomic_read(&ad->state) != 0; +} + +// Initialize the scheduler-specific data when initializing the request queue +static int adios_init_sched(struct request_queue *q, struct elevator_queue *eq) { + struct adios_data *ad; + int ret = -ENOMEM; + u8 optype = 0; + + ad = kzalloc_node(sizeof(*ad), GFP_KERNEL, q->node); + if (!ad) { + pr_err("adios: Failed to create adios_data\n"); + goto put_eq; + } + + eq->elevator_data = ad; + + // Create a memory pool for adios_rq_data + ad->rq_data_pool = kmem_cache_create("rq_data_pool", + sizeof(struct adios_rq_data), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ad->rq_data_pool) { + pr_err("adios: Failed to create rq_data_pool\n"); + goto free_ad; + } + + /* Create a memory pool for dl_group */ + ad->dl_group_pool = kmem_cache_create("dl_group_pool", + sizeof(struct dl_group), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ad->dl_group_pool) { + pr_err("adios: Failed to create dl_group_pool\n"); + goto destroy_rq_data_pool; + } + + for (int i = 0; i < ADIOS_PQ_LEVELS; i++) + INIT_LIST_HEAD(&ad->prio_queue[i]); + + for (u8 i = 0; i < ADIOS_DL_TYPES; i++) { + ad->dl_tree[i] = RB_ROOT_CACHED; + ad->dl_prio[i] = default_dl_prio[i]; + } + ad->dl_bias = 0; + + for (u8 page = 0; page < ADIOS_BQ_PAGES; page++) + for (optype = 0; optype < ADIOS_OPTYPES; optype++) + INIT_LIST_HEAD(&ad->batch_queue[page][optype]); + + ad->aggr_buckets = kzalloc(sizeof(*ad->aggr_buckets), GFP_KERNEL); + if (!ad->aggr_buckets) { + pr_err("adios: Failed to allocate aggregation buckets\n"); + goto destroy_dl_group_pool; + } + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) { + struct latency_model *model = &ad->latency_model[optype]; + struct latency_model_params *params; + + spin_lock_init(&model->update_lock); + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (!params) { + pr_err("adios: Failed to allocate latency_model_params\n"); + goto free_buckets; + } + params->last_update_jiffies = jiffies; + RCU_INIT_POINTER(model->params, params); + + model->pcpu_buckets = alloc_percpu(struct lm_buckets); + if (!model->pcpu_buckets) { + pr_err("adios: Failed to allocate per-CPU buckets\n"); + kfree(params); + goto free_buckets; + } + + model->lm_shrink_at_kreqs = default_lm_shrink_at_kreqs; + model->lm_shrink_at_gbytes = default_lm_shrink_at_gbytes; + model->lm_shrink_resist = default_lm_shrink_resist; + } + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) { + ad->latency_target[optype] = default_latency_target[optype]; + ad->batch_limit[optype] = default_batch_limit[optype]; + } + + eq->elevator_data = ad; + + ad->is_rotational = !!(q->limits.features & BLK_FEAT_ROTATIONAL); + ad->global_latency_window = (ad->is_rotational)? 
+ default_global_latency_window_rotational: + default_global_latency_window; + ad->bq_refill_below_ratio = default_bq_refill_below_ratio; + ad->lat_model_latency_limit = default_lat_model_latency_limit; + ad->batch_order = default_batch_order; + ad->compliance_flags = default_compliance_flags; + + ad->insert_request_fn = insert_request_pre_stability; + + atomic_set(&ad->state, 0); + + spin_lock_init(&ad->lock); + spin_lock_init(&ad->pq_lock); + spin_lock_init(&ad->bq_lock); + spin_lock_init(&ad->barrier_lock); + INIT_LIST_HEAD(&ad->barrier_queue); + + timer_setup(&ad->update_timer, update_timer_callback, 0); + + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + + ad->queue = q; + blk_stat_enable_accounting(q); + + q->elevator = eq; + adios_depth_updated(q); + return 0; + +free_buckets: + pr_err("adios: Failed to allocate per-cpu buckets\n"); + while (optype-- > 0) { + struct latency_model *prev_model = &ad->latency_model[optype]; + kfree(rcu_access_pointer(prev_model->params)); + free_percpu(prev_model->pcpu_buckets); + } + kfree(ad->aggr_buckets); +destroy_dl_group_pool: + kmem_cache_destroy(ad->dl_group_pool); +destroy_rq_data_pool: + kmem_cache_destroy(ad->rq_data_pool); +free_ad: + kfree(ad); +put_eq: + kobject_put(&eq->kobj); + return ret; +} + +// Clean up and free resources when exiting the scheduler +static void adios_exit_sched(struct elevator_queue *e) { + struct adios_data *ad = e->elevator_data; + + timer_shutdown_sync(&ad->update_timer); + + WARN_ON_ONCE(!list_empty(&ad->barrier_queue)); + for (int i = 0; i < 2; i++) + WARN_ON_ONCE(!list_empty(&ad->prio_queue[i])); + + for (u8 i = 0; i < ADIOS_OPTYPES; i++) { + struct latency_model *model = &ad->latency_model[i]; + struct latency_model_params *params = rcu_access_pointer(model->params); + + RCU_INIT_POINTER(model->params, NULL); + kfree_rcu(params, rcu); + + free_percpu(model->pcpu_buckets); + } + + synchronize_rcu(); + + kfree(ad->aggr_buckets); + + if (ad->rq_data_pool) + kmem_cache_destroy(ad->rq_data_pool); + + if (ad->dl_group_pool) + kmem_cache_destroy(ad->dl_group_pool); + + blk_stat_disable_accounting(ad->queue); + + kfree(ad); +} + +static void sideload_latency_model( + struct latency_model *model, u64 base, u64 slope) { + struct latency_model_params *old_params, *new_params; + unsigned long flags; + + new_params = kzalloc(sizeof(*new_params), GFP_KERNEL); + if (!new_params) + return; + + spin_lock_irqsave(&model->update_lock, flags); + + old_params = rcu_dereference_protected(model->params, + lockdep_is_held(&model->update_lock)); + + new_params->last_update_jiffies = jiffies; + + // Initialize base and its statistics as a single sample. + new_params->base = base; + new_params->small_sum_delay = base; + new_params->small_count = 1; + + // Initialize slope and its statistics as a single sample. 
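+ // Seeding large_sum_bsize with 1024 (1 KiB) makes the recomputed slope
+ // (large_sum_delay / DIV_ROUND_UP(large_sum_bsize, 1024)) equal the value
+ // supplied here, in ns per KiB; latency_model_predict() then returns
+ // base + slope * DIV_ROUND_UP(block_size - 4096, 1024) for large requests.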
+ new_params->slope = slope; + new_params->large_sum_delay = slope; + new_params->large_sum_bsize = 1024; /* Corresponds to 1 KiB */ + + lm_reset_pcpu_buckets(model); + + rcu_assign_pointer(model->params, new_params); + spin_unlock_irqrestore(&model->update_lock, flags); + + kfree_rcu(old_params, rcu); +} + +// Define sysfs attributes for operation types +#define SYSFS_OPTYPE_DECL(name, optype) \ +static ssize_t adios_lat_model_##name##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + struct latency_model *model = &ad->latency_model[optype]; \ + struct latency_model_params *params; \ + ssize_t len = 0; \ + u64 base, slope; \ + rcu_read_lock(); \ + params = rcu_dereference(model->params); \ + base = params->base; \ + slope = params->slope; \ + rcu_read_unlock(); \ + len += sprintf(page, "base : %llu ns\n", base); \ + len += sprintf(page + len, "slope: %llu ns/KiB\n", slope); \ + return len; \ +} \ +static ssize_t adios_lat_model_##name##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + struct latency_model *model = &ad->latency_model[optype]; \ + u64 base, slope; \ + int ret; \ + ret = sscanf(page, "%llu %llu", &base, &slope); \ + if (ret != 2) \ + return -EINVAL; \ + sideload_latency_model(model, base, slope); \ + reset_buckets(ad->aggr_buckets); \ + return count; \ +} \ +static ssize_t adios_lat_target_##name##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + return sprintf(page, "%llu\n", ad->latency_target[optype]); \ +} \ +static ssize_t adios_lat_target_##name##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + unsigned long nsec; \ + int ret; \ + ret = kstrtoul(page, 10, &nsec); \ + if (ret) \ + return ret; \ + sideload_latency_model(&ad->latency_model[optype], 0, 0); \ + ad->latency_target[optype] = nsec; \ + return count; \ +} \ +static ssize_t adios_batch_limit_##name##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + return sprintf(page, "%u\n", ad->batch_limit[optype]); \ +} \ +static ssize_t adios_batch_limit_##name##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + unsigned long max_batch; \ + int ret; \ + ret = kstrtoul(page, 10, &max_batch); \ + if (ret || max_batch == 0) \ + return -EINVAL; \ + struct adios_data *ad = e->elevator_data; \ + ad->batch_limit[optype] = max_batch; \ + return count; \ +} + +SYSFS_OPTYPE_DECL(read, ADIOS_READ); +SYSFS_OPTYPE_DECL(write, ADIOS_WRITE); +SYSFS_OPTYPE_DECL(discard, ADIOS_DISCARD); + +// Show the maximum batch size actually achieved for each operation type +static ssize_t adios_batch_actual_max_show( + struct elevator_queue *e, char *page) { + struct adios_data *ad = e->elevator_data; + u32 total_count, read_count, write_count, discard_count; + + total_count = ad->batch_actual_max_total; + read_count = ad->batch_actual_max_size[ADIOS_READ]; + write_count = ad->batch_actual_max_size[ADIOS_WRITE]; + discard_count = ad->batch_actual_max_size[ADIOS_DISCARD]; + + return sprintf(page, + "Total : %u\nDiscard: %u\nRead : %u\nWrite : %u\n", + total_count, discard_count, read_count, write_count); +} + +#define SYSFS_ULL_DECL(field, min_val, max_val) \ +static ssize_t adios_##field##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + return sprintf(page, "%llu\n", ad->field); \ +} \ 
+static ssize_t adios_##field##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + unsigned long val; \ + int ret; \ + ret = kstrtoul(page, 10, &val); \ + if (ret || val < (min_val) || val > (max_val)) \ + return -EINVAL; \ + ad->field = val; \ + return count; \ +} + +SYSFS_ULL_DECL(global_latency_window, 0, ULLONG_MAX) +SYSFS_ULL_DECL(compliance_flags, 0, ULLONG_MAX) + +#define SYSFS_INT_DECL(field, min_val, max_val) \ +static ssize_t adios_##field##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + return sprintf(page, "%d\n", ad->field); \ +} \ +static ssize_t adios_##field##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + int val; \ + int ret; \ + ret = kstrtoint(page, 10, &val); \ + if (ret || val < (min_val) || val > (max_val)) \ + return -EINVAL; \ + ad->field = val; \ + return count; \ +} + +SYSFS_INT_DECL(bq_refill_below_ratio, 0, 100) +SYSFS_INT_DECL(lat_model_latency_limit, 0, 2*NSEC_PER_SEC) +SYSFS_INT_DECL(batch_order, ADIOS_BO_OPTYPE, !!ad->is_rotational) + +// Show the read priority +static ssize_t adios_read_priority_show( + struct elevator_queue *e, char *page) { + struct adios_data *ad = e->elevator_data; + return sprintf(page, "%d\n", ad->dl_prio[0]); +} + +// Set the read priority +static ssize_t adios_read_priority_store( + struct elevator_queue *e, const char *page, size_t count) { + struct adios_data *ad = e->elevator_data; + int prio; + int ret; + + ret = kstrtoint(page, 10, &prio); + if (ret || prio < -20 || prio > 19) + return -EINVAL; + + guard(spinlock_irqsave)(&ad->lock); + ad->dl_prio[0] = prio; + ad->dl_bias = 0; + + return count; +} + +// Reset batch queue statistics +static ssize_t adios_reset_bq_stats_store( + struct elevator_queue *e, const char *page, size_t count) { + struct adios_data *ad = e->elevator_data; + unsigned long val; + int ret; + + ret = kstrtoul(page, 10, &val); + if (ret || val != 1) + return -EINVAL; + + for (u8 i = 0; i < ADIOS_OPTYPES; i++) + ad->batch_actual_max_size[i] = 0; + + ad->batch_actual_max_total = 0; + + return count; +} + +// Reset the latency model parameters or load them from user input +static ssize_t adios_reset_lat_model_store( + struct elevator_queue *e, const char *page, size_t count) +{ + struct adios_data *ad = e->elevator_data; + struct latency_model *model; + int ret; + + /* + * Differentiate between two modes based on input format: + * 1. "1": Fully reset the model (backward compatibility). + * 2. "R_base R_slope W_base W_slope D_base D_slope": Load values. + */ + if (!strchr(page, ' ')) { + // Mode 1: Full reset. + unsigned long val; + + ret = kstrtoul(page, 10, &val); + if (ret || val != 1) + return -EINVAL; + + for (u8 i = 0; i < ADIOS_OPTYPES; i++) { + model = &ad->latency_model[i]; + sideload_latency_model(model, 0, 0); + } + } else { + // Mode 2: Load initial values for all latency models. 
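+ // Illustrative usage only (the six numbers below are made up, not
+ // recommended defaults): writing
+ //   "120000 512 180000 768 0 0"
+ // to reset_lat_model sideloads a base of 120000 ns and a slope of
+ // 512 ns/KiB for reads, 180000 ns / 768 ns/KiB for writes, and
+ // zeroes (a plain reset) for discards, in the R/W/D order parsed
+ // by the sscanf() below.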
+ u64 params[3][2]; /* 0:base, 1:slope for R, W, D */ + + ret = sscanf(page, "%llu %llu %llu %llu %llu %llu", + &params[ADIOS_READ ][0], &params[ADIOS_READ ][1], + &params[ADIOS_WRITE ][0], &params[ADIOS_WRITE ][1], + &params[ADIOS_DISCARD][0], &params[ADIOS_DISCARD][1]); + + if (ret != 6) + return -EINVAL; + + for (u8 i = ADIOS_READ; i <= ADIOS_DISCARD; i++) { + model = &ad->latency_model[i]; + sideload_latency_model(model, params[i][0], params[i][1]); + } + } + reset_buckets(ad->aggr_buckets); + + return count; +} + +// Show the ADIOS version +static ssize_t adios_version_show(struct elevator_queue *e, char *page) { + return sprintf(page, "%s\n", ADIOS_VERSION); +} + +// Define sysfs attributes for dynamic thresholds +#define SHRINK_THRESHOLD_ATTR_RW(name, model_field, min_value, max_value) \ +static ssize_t adios_shrink_##name##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + unsigned long val; \ + int ret; \ + ret = kstrtoul(page, 10, &val); \ + if (ret || val < min_value || val > max_value) \ + return -EINVAL; \ + for (u8 i = 0; i < ADIOS_OPTYPES; i++) { \ + struct latency_model *model = &ad->latency_model[i]; \ + unsigned long flags; \ + spin_lock_irqsave(&model->update_lock, flags); \ + model->model_field = val; \ + spin_unlock_irqrestore(&model->update_lock, flags); \ + } \ + return count; \ +} \ +static ssize_t adios_shrink_##name##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + u32 val = 0; \ + unsigned long flags; \ + struct latency_model *model = &ad->latency_model[0]; \ + spin_lock_irqsave(&model->update_lock, flags); \ + val = model->model_field; \ + spin_unlock_irqrestore(&model->update_lock, flags); \ + return sprintf(page, "%u\n", val); \ +} + +SHRINK_THRESHOLD_ATTR_RW(at_kreqs, lm_shrink_at_kreqs, 1, 100000) +SHRINK_THRESHOLD_ATTR_RW(at_gbytes, lm_shrink_at_gbytes, 1, 1000) +SHRINK_THRESHOLD_ATTR_RW(resist, lm_shrink_resist, 1, 3) + +// Define sysfs attributes +#define AD_ATTR(name, show_func, store_func) \ + __ATTR(name, 0644, show_func, store_func) +#define AD_ATTR_RW(name) \ + __ATTR(name, 0644, adios_##name##_show, adios_##name##_store) +#define AD_ATTR_RO(name) \ + __ATTR(name, 0444, adios_##name##_show, NULL) +#define AD_ATTR_WO(name) \ + __ATTR(name, 0200, NULL, adios_##name##_store) + +// Define sysfs attributes for ADIOS scheduler +static struct elv_fs_entry adios_sched_attrs[] = { + AD_ATTR_RO(batch_actual_max), + AD_ATTR_RW(bq_refill_below_ratio), + AD_ATTR_RW(global_latency_window), + AD_ATTR_RW(lat_model_latency_limit), + AD_ATTR_RW(batch_order), + AD_ATTR_RW(compliance_flags), + + AD_ATTR_RW(batch_limit_read), + AD_ATTR_RW(batch_limit_write), + AD_ATTR_RW(batch_limit_discard), + + AD_ATTR_RW(lat_model_read), + AD_ATTR_RW(lat_model_write), + AD_ATTR_RW(lat_model_discard), + + AD_ATTR_RW(lat_target_read), + AD_ATTR_RW(lat_target_write), + AD_ATTR_RW(lat_target_discard), + + AD_ATTR_RW(shrink_at_kreqs), + AD_ATTR_RW(shrink_at_gbytes), + AD_ATTR_RW(shrink_resist), + + AD_ATTR_RW(read_priority), + + AD_ATTR_WO(reset_bq_stats), + AD_ATTR_WO(reset_lat_model), + AD_ATTR(adios_version, adios_version_show, NULL), + + __ATTR_NULL +}; + +// Define the ADIOS scheduler type +static struct elevator_type mq_adios = { + .ops = { + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .limit_depth = adios_limit_depth, + .depth_updated = adios_depth_updated, + .request_merged = adios_request_merged, + .requests_merged = adios_merged_requests, + 
.bio_merge = adios_bio_merge, + .insert_requests = adios_insert_requests, + .prepare_request = adios_prepare_request, + .dispatch_request = adios_dispatch_request, + .completed_request = adios_completed_request, + .finish_request = adios_finish_request, + .has_work = adios_has_work, + .init_sched = adios_init_sched, + .exit_sched = adios_exit_sched, + }, + .elevator_attrs = adios_sched_attrs, + .elevator_name = "adios", + .elevator_owner = THIS_MODULE, +}; +MODULE_ALIAS("mq-adios-iosched"); + +#define ADIOS_PROGNAME "Adaptive Deadline I/O Scheduler" +#define ADIOS_AUTHOR "Masahito Suzuki" + +// Initialize the ADIOS scheduler module +static int __init adios_init(void) { + printk(KERN_INFO "%s %s by %s\n", + ADIOS_PROGNAME, ADIOS_VERSION, ADIOS_AUTHOR); + return elv_register(&mq_adios); +} + +// Exit the ADIOS scheduler module +static void __exit adios_exit(void) { + elv_unregister(&mq_adios); +} + +module_init(adios_init); +module_exit(adios_exit); + +MODULE_AUTHOR(ADIOS_AUTHOR); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(ADIOS_PROGNAME); \ No newline at end of file diff --git a/block/elevator.c b/block/elevator.c index e2ebfbf107b3af..49ede90c131d3b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -741,7 +741,13 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, void elevator_set_default(struct request_queue *q) { struct elv_change_ctx ctx = { +#ifdef CONFIG_MQ_IOSCHED_DEFAULT_ADIOS + .name = "adios", +#elif defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) + .name = "bfq", +#else .name = "mq-deadline", +#endif .no_uevent = true, }; int err; @@ -758,17 +764,25 @@ void elevator_set_default(struct request_queue *q) * have multiple queues or mq-deadline is not available, default * to "none". */ +#ifndef CONFIG_MQ_IOSCHED_DEFAULT_ADIOS + bool is_sq = q->nr_hw_queues == 1 || blk_mq_is_shared_tags(q->tag_set->flags); + if (!is_sq) +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER) + ctx.name = "kyber"; +#else + return; +#endif +#endif // CONFIG_MQ_IOSCHED_DEFAULT_ADIOS + e = elevator_find_get(ctx.name); if (!e) return; - if ((q->nr_hw_queues == 1 || - blk_mq_is_shared_tags(q->tag_set->flags))) { - err = elevator_change(q, &ctx); - if (err < 0) - pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", - ctx.name, err); - } + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", + ctx.name, err); + elevator_put(e); } diff --git a/drivers/Makefile b/drivers/Makefile index 8e1ffa4358d5f1..2a9eec99c1f7c3 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -64,14 +64,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ -# gpu/ comes after char for AGP vs DRM startup and after iommu -obj-y += gpu/ - obj-$(CONFIG_CONNECTOR) += connector/ -# i810fb depends on char/agp/ -obj-$(CONFIG_FB_I810) += video/fbdev/i810/ - obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ @@ -83,6 +77,13 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ + +# gpu/ comes after char for AGP vs DRM startup and after iommu +obj-y += gpu/ + +# i810fb depends on char/agp/ +obj-$(CONFIG_FB_I810) += video/fbdev/i810/ + obj-$(CONFIG_TARGET_CORE) += target/ obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 7a7f88b3fa2b18..cb26ab099da2bd 100644 --- a/drivers/ata/ahci.c +++ 
b/drivers/ata/ahci.c @@ -1672,7 +1672,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) } #endif -static void ahci_remap_check(struct pci_dev *pdev, int bar, +static int ahci_remap_check(struct pci_dev *pdev, int bar, struct ahci_host_priv *hpriv) { int i; @@ -1685,7 +1685,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, pci_resource_len(pdev, bar) < SZ_512K || bar != AHCI_PCI_BAR_STANDARD || !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) - return; + return 0; cap = readq(hpriv->mmio + AHCI_REMAP_CAP); for (i = 0; i < AHCI_MAX_REMAP; i++) { @@ -1700,18 +1700,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, } if (!hpriv->remapped_nvme) - return; - - dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n", - hpriv->remapped_nvme); - dev_warn(&pdev->dev, - "Switch your BIOS from RAID to AHCI mode to use them.\n"); + return 0; - /* - * Don't rely on the msi-x capability in the remap case, - * share the legacy interrupt across ahci and remapped devices. - */ - hpriv->flags |= AHCI_HFLAG_NO_MSI; + /* Abort probe, allowing intel-nvme-remap to step in when available */ + dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n"); + return -ENODEV; } static int ahci_get_irq_vector(struct ata_host *host, int port) @@ -1975,7 +1968,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return -ENOMEM; /* detect remapped nvme devices */ - ahci_remap_check(pdev, ahci_pci_bar, hpriv); + rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv); + if (rc) + return rc; sysfs_add_file_to_group(&pdev->dev.kobj, &dev_attr_remapped_nvme.attr, diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index c79abdfcd7a9b0..59acfa77934b9c 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -34,7 +34,7 @@ #include "libata.h" static int ahci_skip_host_reset; -int ahci_ignore_sss; +int ahci_ignore_sss = 1; EXPORT_SYMBOL_GPL(ahci_ignore_sss); module_param_named(skip_host_reset, ahci_skip_host_reset, int, 0444); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index a6ecc203f7b7f3..46ea23cbb75437 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -18,10 +18,16 @@ #include "cpufreq_ondemand.h" /* On-demand governor macros */ +#if defined(CONFIG_ZEN_INTERACTIVE) +#define DEF_FREQUENCY_UP_THRESHOLD (55) +#define MICRO_FREQUENCY_UP_THRESHOLD (60) +#define DEF_SAMPLING_DOWN_FACTOR (5) +#else #define DEF_FREQUENCY_UP_THRESHOLD (80) +#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define DEF_SAMPLING_DOWN_FACTOR (1) +#endif #define MAX_SAMPLING_DOWN_FACTOR (100000) -#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 90ff6be85cf466..15159c1cf6e1a0 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; + struct rcu_head rcu; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; @@ -368,13 +369,22 @@ static void evdev_attach_client(struct evdev *evdev, spin_unlock(&evdev->client_lock); } +static void evdev_reclaim_client(struct rcu_head *rp) +{ + struct evdev_client *client = container_of(rp, struct evdev_client, rcu); + unsigned int i; + for (i = 0; i < EV_CNT; ++i) + bitmap_free(client->evmasks[i]); + kvfree(client); +} + static void evdev_detach_client(struct evdev *evdev, 
struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); - synchronize_rcu(); + call_rcu(&client->rcu, evdev_reclaim_client); } static int evdev_open_device(struct evdev *evdev) @@ -427,7 +437,6 @@ static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; - unsigned int i; mutex_lock(&evdev->mutex); @@ -439,11 +448,6 @@ static int evdev_release(struct inode *inode, struct file *file) evdev_detach_client(evdev, client); - for (i = 0; i < EV_CNT; ++i) - bitmap_free(client->evmasks[i]); - - kvfree(client); - evdev_close_device(evdev); return 0; @@ -486,7 +490,6 @@ static int evdev_open(struct inode *inode, struct file *file) err_free_client: evdev_detach_client(evdev, client); - kvfree(client); return error; } diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index 038ccbd9e3ba23..de5e4f5145af8d 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 +ifdef CONFIG_X86_64 +ifdef CONFIG_SATA_AHCI +obj-y += intel-nvme-remap.o +endif +endif + obj-$(CONFIG_PCIE_CADENCE) += cadence/ obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c new file mode 100644 index 00000000000000..e105e6f5cc91d1 --- /dev/null +++ b/drivers/pci/controller/intel-nvme-remap.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel remapped NVMe device support. + * + * Copyright (c) 2019 Endless Mobile, Inc. + * Author: Daniel Drake + * + * Some products ship by default with the SATA controller in "RAID" or + * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this + * mode, which we refer to as "remapped NVMe" mode, any installed NVMe + * devices disappear from the PCI bus, and instead their I/O memory becomes + * available within the AHCI device BARs. + * + * This scheme is understood to be a way of avoiding usage of the standard + * Windows NVMe driver under that OS, instead mandating usage of Intel's + * driver instead, which has better power management, and presumably offers + * some RAID/disk-caching solutions too. + * + * Here in this driver, we support the remapped NVMe mode by claiming the + * AHCI device and creating a fake PCIe root port. On the new bus, the + * original AHCI device is exposed with only minor tweaks. Then, fake PCI + * devices corresponding to the remapped NVMe devices are created. The usual + * ahci and nvme drivers are then expected to bind to these devices and + * operate as normal. + * + * The PCI configuration space for the NVMe devices is completely + * unavailable, so we fake a minimal one and hope for the best. + * + * Interrupts are shared between the AHCI and NVMe devices. For simplicity, + * we only support the legacy interrupt here, although MSI support + * could potentially be added later. 
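+ *
+ * Resulting topology, sketched here for orientation: the fake bus is
+ * placed in a fresh PCI domain (at least 0x10000, see
+ * find_free_domain() below), slot 0 exposes the pass-through AHCI
+ * controller, and slots 1..n expose one faked config space per
+ * remapped NVMe device, backed by memory windows inside the AHCI BAR.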
+ */ + +#define MODULE_NAME "intel-nvme-remap" + +#include +#include +#include +#include +#include + +#define AHCI_PCI_BAR_STANDARD 5 + +struct nvme_remap_dev { + struct pci_dev *dev; /* AHCI device */ + struct pci_bus *bus; /* our fake PCI bus */ + struct pci_sysdata sysdata; + int irq_base; /* our fake interrupts */ + + /* + * When we detect an all-ones write to a BAR register, this flag + * is set, so that we return the BAR size on the next read (a + * standard PCI behaviour). + * This includes the assumption that an all-ones BAR write is + * immediately followed by a read of the same register. + */ + bool bar_sizing; + + /* + * Resources copied from the AHCI device, to be regarded as + * resources on our fake bus. + */ + struct resource ahci_resources[PCI_NUM_RESOURCES]; + + /* Resources corresponding to the NVMe devices. */ + struct resource remapped_dev_mem[AHCI_MAX_REMAP]; + + /* Number of remapped NVMe devices found. */ + int num_remapped_devices; +}; + +static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus) +{ + return container_of(bus->sysdata, struct nvme_remap_dev, sysdata); +} + + +/******** PCI configuration space **********/ + +/* + * Helper macros for tweaking returned contents of PCI configuration space. + * + * value contains len bytes of data read from reg. + * If fixup_reg is included in that range, fix up the contents of that + * register to fixed_value. + */ +#define NR_FIX8(fixup_reg, fixed_value) do { \ + if (reg <= fixup_reg && fixup_reg < reg + len) \ + ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \ + } while (0) + +#define NR_FIX16(fixup_reg, fixed_value) do { \ + NR_FIX8(fixup_reg, fixed_value); \ + NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ + } while (0) + +#define NR_FIX24(fixup_reg, fixed_value) do { \ + NR_FIX8(fixup_reg, fixed_value); \ + NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ + NR_FIX8(fixup_reg + 2, fixed_value >> 16); \ + } while (0) + +#define NR_FIX32(fixup_reg, fixed_value) do { \ + NR_FIX16(fixup_reg, (u16) fixed_value); \ + NR_FIX16(fixup_reg + 2, fixed_value >> 16); \ + } while (0) + +/* + * Read PCI config space of the slot 0 (AHCI) device. + * We pass through the read request to the underlying device, but + * tweak the results in some cases. + */ +static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg, + int len, u32 *value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct pci_bus *ahci_dev_bus = nrdev->dev->bus; + int ret; + + ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn, + reg, len, value); + if (ret) + return ret; + + /* + * Adjust the device class, to prevent this driver from attempting to + * additionally probe the device we're simulating here. + */ + NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI); + + /* + * Unset interrupt pin, otherwise ACPI tries to find routing + * info for our virtual IRQ, fails, and complains. + */ + NR_FIX8(PCI_INTERRUPT_PIN, 0); + + /* + * Truncate the AHCI BAR to not include the region that covers the + * hidden devices. This will cause the ahci driver to successfully + * probe th new device (instead of handing it over to this driver). + */ + if (nrdev->bar_sizing) { + NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1)); + nrdev->bar_sizing = false; + } + + return PCIBIOS_SUCCESSFUL; +} + +/* + * Read PCI config space of a remapped device. + * Since the original PCI config space is inaccessible, we provide a minimal, + * fake config space instead. 
+ */ +static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port, + int reg, int len, u32 *value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct resource *remapped_mem; + + if (port > nrdev->num_remapped_devices) + return PCIBIOS_DEVICE_NOT_FOUND; + + *value = 0; + remapped_mem = &nrdev->remapped_dev_mem[port - 1]; + + /* Set a Vendor ID, otherwise Linux assumes no device is present */ + NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL); + + /* Always appear on & bus mastering */ + NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + + /* Set class so that nvme driver probes us */ + NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS); + + if (nrdev->bar_sizing) { + NR_FIX32(PCI_BASE_ADDRESS_0, + ~(resource_size(remapped_mem) - 1)); + nrdev->bar_sizing = false; + } else { + resource_size_t mem_start = remapped_mem->start; + + mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64; + NR_FIX32(PCI_BASE_ADDRESS_0, mem_start); + mem_start >>= 32; + NR_FIX32(PCI_BASE_ADDRESS_1, mem_start); + } + + return PCIBIOS_SUCCESSFUL; +} + +/* Read PCI configuration space. */ +static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 *value) +{ + if (PCI_SLOT(devfn) == 0) + return nvme_remap_pci_read_slot0(bus, reg, len, value); + else + return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn), + reg, len, value); +} + +/* + * Write PCI config space of the slot 0 (AHCI) device. + * Apart from the special case of BAR sizing, we disable all writes. + * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master) + * that would affect the operation of the NVMe devices. + */ +static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg, + int len, u32 value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct pci_bus *ahci_dev_bus = nrdev->dev->bus; + + if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) { + /* + * Writing all-ones to a BAR means that the size of the + * memory region is being checked. Flag this so that we can + * reply with an appropriate size on the next read. + */ + if (value == ~0) + nrdev->bar_sizing = true; + + return ahci_dev_bus->ops->write(ahci_dev_bus, + nrdev->dev->devfn, + reg, len, value); + } + + return PCIBIOS_SET_FAILED; +} + +/* + * Write PCI config space of a remapped device. + * Since the original PCI config space is inaccessible, we reject all + * writes, except for the special case of BAR probing. + */ +static int nvme_remap_pci_write_remapped(struct pci_bus *bus, + unsigned int port, + int reg, int len, u32 value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + + if (port > nrdev->num_remapped_devices) + return PCIBIOS_DEVICE_NOT_FOUND; + + /* + * Writing all-ones to a BAR means that the size of the memory + * region is being checked. Flag this so that we can reply with + * an appropriate size on the next read. + */ + if (value == ~0 && reg >= PCI_BASE_ADDRESS_0 + && reg <= PCI_BASE_ADDRESS_5) { + nrdev->bar_sizing = true; + return PCIBIOS_SUCCESSFUL; + } + + return PCIBIOS_SET_FAILED; +} + +/* Write PCI configuration space. 
*/ +static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 value) +{ + if (PCI_SLOT(devfn) == 0) + return nvme_remap_pci_write_slot0(bus, reg, len, value); + else + return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn), + reg, len, value); +} + +static struct pci_ops nvme_remap_pci_ops = { + .read = nvme_remap_pci_read, + .write = nvme_remap_pci_write, +}; + + +/******** Initialization & exit **********/ + +/* + * Find a PCI domain ID to use for our fake bus. + * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits). + */ +static int find_free_domain(void) +{ + int domain = 0xffff; + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus)) != NULL) + domain = max_t(int, domain, pci_domain_nr(bus)); + + return domain + 1; +} + +static int find_remapped_devices(struct nvme_remap_dev *nrdev, + struct list_head *resources) +{ + void __iomem *mmio; + int i, count = 0; + u32 cap; + + mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD, + pci_resource_len(nrdev->dev, + AHCI_PCI_BAR_STANDARD)); + if (!mmio) + return -ENODEV; + + /* Check if this device might have remapped nvme devices. */ + if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K || + !(readl(mmio + AHCI_VSCAP) & 1)) + return -ENODEV; + + cap = readq(mmio + AHCI_REMAP_CAP); + for (i = AHCI_MAX_REMAP-1; i >= 0; i--) { + struct resource *remapped_mem; + + if ((cap & (1 << i)) == 0) + continue; + if (readl(mmio + ahci_remap_dcc(i)) + != PCI_CLASS_STORAGE_EXPRESS) + continue; + + /* We've found a remapped device */ + remapped_mem = &nrdev->remapped_dev_mem[count++]; + remapped_mem->start = + pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD) + + ahci_remap_base(i); + remapped_mem->end = remapped_mem->start + + AHCI_REMAP_N_SIZE - 1; + remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED; + pci_add_resource(resources, remapped_mem); + } + + pcim_iounmap(nrdev->dev, mmio); + + if (count == 0) + return -ENODEV; + + nrdev->num_remapped_devices = count; + dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n", + nrdev->num_remapped_devices); + return 0; +} + +static void nvme_remap_remove_root_bus(void *data) +{ + struct pci_bus *bus = data; + + pci_stop_root_bus(bus); + pci_remove_root_bus(bus); +} + +static int nvme_remap_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + struct nvme_remap_dev *nrdev; + LIST_HEAD(resources); + int i; + int ret; + struct pci_dev *child; + + nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL); + nrdev->sysdata.domain = find_free_domain(); + nrdev->sysdata.nvme_remap_dev = dev; + nrdev->dev = dev; + pci_set_drvdata(dev, nrdev); + + ret = pcim_enable_device(dev); + if (ret < 0) + return ret; + + pci_set_master(dev); + + ret = find_remapped_devices(nrdev, &resources); + if (ret) + return ret; + + /* Add resources from the original AHCI device */ + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &dev->resource[i]; + + if (res->start) { + struct resource *nr_res = &nrdev->ahci_resources[i]; + + nr_res->start = res->start; + nr_res->end = res->end; + nr_res->flags = res->flags; + pci_add_resource(&resources, nr_res); + } + } + + /* Create virtual interrupts */ + nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0, + nrdev->num_remapped_devices + 1, + 0); + if (nrdev->irq_base < 0) + return nrdev->irq_base; + + /* Create and populate PCI bus */ + nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops, + &nrdev->sysdata, &resources); + if (!nrdev->bus) + return 
-ENODEV; + + if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus, + nrdev->bus)) + return -ENOMEM; + + /* We don't support sharing MSI interrupts between these devices */ + nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI; + + pci_scan_child_bus(nrdev->bus); + + list_for_each_entry(child, &nrdev->bus->devices, bus_list) { + /* + * Prevent PCI core from trying to move memory BARs around. + * The hidden NVMe devices are at fixed locations. + */ + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &child->resource[i]; + + if (res->flags & IORESOURCE_MEM) + res->flags |= IORESOURCE_PCI_FIXED; + } + + /* Share the legacy IRQ between all devices */ + child->irq = dev->irq; + } + + pci_assign_unassigned_bus_resources(nrdev->bus); + pci_bus_add_devices(nrdev->bus); + + return 0; +} + +static const struct pci_device_id nvme_remap_ids[] = { + /* + * Match all Intel RAID controllers. + * + * There's overlap here with the set of devices detected by the ahci + * driver, but ahci will only successfully probe when there + * *aren't* any remapped NVMe devices, and this driver will only + * successfully probe when there *are* remapped NVMe devices that + * need handling. + */ + { + PCI_VDEVICE(INTEL, PCI_ANY_ID), + .class = PCI_CLASS_STORAGE_RAID << 8, + .class_mask = 0xffffff00, + }, + {0,} +}; +MODULE_DEVICE_TABLE(pci, nvme_remap_ids); + +static struct pci_driver nvme_remap_drv = { + .name = MODULE_NAME, + .id_table = nvme_remap_ids, + .probe = nvme_remap_probe, +}; +module_pci_driver(nvme_remap_drv); + +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL v2"); diff --git a/fs/proc/base.c b/fs/proc/base.c index 6299878e3d97e6..8558fb1c26c0a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3357,6 +3357,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), +#ifdef CONFIG_MEM_SOFT_DIRTY + REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations), +#endif #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d1598576506c1e..5898cd154fd252 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -406,6 +406,7 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_pagemap_reset_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc35a0543f0191..8e0afdd680fcf3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1580,6 +1580,8 @@ enum clear_refs_types { struct clear_refs_private { enum clear_refs_types type; + unsigned long start, end; + bool clear_range; }; #ifdef CONFIG_MEM_SOFT_DIRTY @@ -1600,7 +1602,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return folio_maybe_dma_pinned(folio); } -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* @@ -1610,37 +1612,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. 
*/ pte_t ptent = ptep_get(pte); + bool ret = false; if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) - return; + return ret; old_pte = ptep_modify_prot_start(vma, addr, pte); + ret = pte_soft_dirty(old_pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { + ret = pte_swp_soft_dirty(ptent); ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } + return ret; } #else -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + return false; } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + bool ret = false; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); + + ret = pmd_soft_dirty(old); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) @@ -1651,14 +1662,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + ret = pmd_swp_soft_dirty(pmd); pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } + return ret; } #else -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { + return false; } #endif @@ -1671,6 +1685,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct folio *folio; + BUG_ON(addr < cp->start || end > cp->end); + ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { @@ -1728,9 +1744,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; - if (vma->vm_flags & VM_PFNMAP) + if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP)) return 1; + BUG_ON(start < cp->start || end > cp->end); + /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. 
@@ -1754,10 +1772,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF] = {}; + char buffer[18] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + unsigned long start, end; + bool clear_range; int itype; int rv; @@ -1765,12 +1785,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &itype); - if (rv < 0) - return rv; - type = (enum clear_refs_types)itype; - if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) - return -EINVAL; + + if (buffer[0] == '6') + { + static int once; + + if (!once++) + printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n"); + + if (count != 17) + return -EINVAL; + + type = CLEAR_REFS_SOFT_DIRTY; + start = *(unsigned long *)(buffer + 1); + end = *(unsigned long *)(buffer + 1 + 8); + } + else + { + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) + return -EINVAL; + + start = 0; + end = -1UL; + } task = get_proc_task(file_inode(file)); if (!task) @@ -1783,40 +1825,86 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .type = type, }; - if (mmap_write_lock_killable(mm)) { - count = -EINTR; - goto out_mm; + if (start || end != -1UL) + { + start = min(start, -1UL) & PAGE_MASK; + end = min(end, -1UL) & PAGE_MASK; + + if (start >= end) + { + count = -EINVAL; + goto out_mm; + } + clear_range = true; } + else + { + clear_range = false; + } + + cp.start = start; + cp.end = end; + cp.clear_range = clear_range; + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); - goto out_unlock; + mmap_write_unlock(mm); + goto out_mm; } if (type == CLEAR_REFS_SOFT_DIRTY) { - for_each_vma(vmi, vma) { - if (!(vma->vm_flags & VM_SOFTDIRTY)) - continue; - vm_flags_clear(vma, VM_SOFTDIRTY); - vma_set_page_prot(vma); + if (mmap_read_lock_killable(mm)) { + count = -EINTR; + goto out_mm; } - + if (!clear_range) + for_each_vma(vmi, vma) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + for_each_vma(vmi, vma) { + vm_flags_clear(vma, VM_SOFTDIRTY); + vma_set_page_prot(vma); + } + mmap_write_downgrade(mm); + break; + } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, mm, 0, -1UL); + 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); + else + { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + } + walk_page_range(mm, start, end == -1UL ? 
-1 : end, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); + mmap_read_unlock(mm); + } + else + { + mmap_write_unlock(mm); } -out_unlock: - mmap_write_unlock(mm); out_mm: mmput(mm); } @@ -1838,6 +1926,7 @@ struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; + bool reset; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -1848,6 +1937,7 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_SOFT_DIRTY_PAGE BIT_ULL(57) #define PM_UFFD_WP BIT_ULL(57) #define PM_GUARD_REGION BIT_ULL(58) #define PM_FILE BIT_ULL(61) @@ -1869,6 +1959,14 @@ static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) return 0; } +static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm) +{ + ((unsigned long *)pm->buffer)[pm->pos++] = addr; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) { if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) @@ -1883,6 +1981,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, unsigned long addr = start; int err = 0; + if (pm->reset) + goto out; + while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); @@ -1929,13 +2030,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); @@ -1993,6 +2094,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct page *page = NULL; struct folio *folio = NULL; + if (pm->reset) + { + if (clear_soft_dirty_pmd(vma, addr, pmdp)) + { + for (; addr != end; addr += PAGE_SIZE) + { + err = add_addr_to_pagemap(addr, pm); + if (err) + break; + } + } + goto trans_huge_done; + } + if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -2001,7 +2116,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) @@ -2022,7 +2137,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); @@ -2055,6 +2170,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, frame += (1 << MAX_SWAPFILES_SHIFT); } } +trans_huge_done: spin_unlock(ptl); return err; } @@ -2070,10 +2186,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { - pagemap_entry_t pme; + if (pm->reset) + { + if (clear_soft_dirty(vma, addr, pte)) + err = add_addr_to_pagemap(addr, pm); + } + else + { + 
pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(&pme, pm); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); + err = add_to_pagemap(&pme, pm); + } if (err) break; } @@ -2177,8 +2301,8 @@ static const struct mm_walk_ops pagemap_ops = { * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t do_pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, bool reset) { struct mm_struct *mm = file->private_data; struct pagemapread pm; @@ -2187,6 +2311,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; + struct mmu_notifier_range range; + size_t buffer_len; if (!mm || !mmget_not_zero(mm)) goto out; @@ -2202,19 +2328,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); + pm.reset = reset; + + buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES); - pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); + pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; - end_vaddr = mm->task_size; + + start_vaddr = svpfn << PAGE_SHIFT; + + if (reset) + { + if (count < sizeof(end_vaddr)) + { + ret = -EINVAL; + goto out_mm; + } + if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) + return -EFAULT; + end_vaddr = min(end_vaddr, mm->task_size); + } + else + { + end_vaddr = mm->task_size; + start_vaddr = end_vaddr; + } /* watch out for wraparound */ - start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; @@ -2239,18 +2384,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + pm.len = min(buffer_len, count / PM_ENTRY_BYTES); + + end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT)); /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; + + if (reset) + { + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, mm, start_vaddr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); + if (reset) + { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); + } mmap_read_unlock(mm); - start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); + BUG_ON(ret && ret != PM_END_OF_BUFFER); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; @@ -2258,6 +2420,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, copied += len; buf += len; count -= len; + + start_vaddr = reset && pm.pos == pm.len ? 
((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) @@ -2271,6 +2435,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, return ret; } +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, false); +} + +static ssize_t pagemap_reset_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, true); +} + static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; @@ -3065,6 +3241,14 @@ const struct file_operations proc_pagemap_operations = { .unlocked_ioctl = do_pagemap_cmd, .compat_ioctl = do_pagemap_cmd, }; + +const struct file_operations proc_pagemap_reset_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_reset_read, + .open = pagemap_open, + .release = pagemap_release, +}; + #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 51b6484c049345..9fae896aae83f8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -508,6 +508,9 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) +# elif defined(CONFIG_FORCE_IRQ_THREADING) +DECLARE_STATIC_KEY_TRUE(force_irqthreads_key); +# define force_irqthreads() (static_branch_likely(&force_irqthreads_key)) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82c9..819daddd3a493b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,7 +201,7 @@ static inline void __mm_zero_struct_page(struct page *page) * that. 
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; diff --git a/include/linux/sched.h b/include/linux/sched.h index b469878de25c8a..5809513cf54b7f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -816,6 +816,32 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_SCHED_BORE +#define BORE_BC_TIMESTAMP_SHIFT 16 + +struct bore_bc { + u64 timestamp: 48; + u64 penalty: 16; +}; + +struct bore_ctx { + struct bore_bc subtree; + struct bore_bc group; + u64 burst_time; + u16 prev_penalty; + u16 curr_penalty; + union { + u16 penalty; + struct { + u8 _; + u8 score; + }; + }; + bool stop_update; + bool futex_waiting; +}; +#endif /* CONFIG_SCHED_BORE */ + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -874,6 +900,9 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif +#ifdef CONFIG_SCHED_BORE + struct bore_ctx bore; +#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h new file mode 100644 index 00000000000000..79dd72b9aa38ff --- /dev/null +++ b/include/linux/sched/bore.h @@ -0,0 +1,39 @@ +#ifndef _KERNEL_SCHED_BORE_H +#define _KERNEL_SCHED_BORE_H + +#include +#include +#include +#include +#include + +#define SCHED_BORE_AUTHOR "Masahito Suzuki" +#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" + +#define SCHED_BORE_VERSION "6.5.9" + +extern u8 __read_mostly sched_bore; +extern u8 __read_mostly sched_burst_inherit_type; +extern u8 __read_mostly sched_burst_smoothness; +extern u8 __read_mostly sched_burst_penalty_offset; +extern uint __read_mostly sched_burst_penalty_scale; +extern uint __read_mostly sched_burst_cache_lifetime; + +extern u8 effective_prio_bore(struct task_struct *p); +extern void update_curr_bore(struct task_struct *p, u64 delta_exec); +extern void restart_burst_bore(struct task_struct *p); +extern void restart_burst_rescale_deadline_bore(struct task_struct *p); +extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, + u64 clone_flags, u64 now); +extern void sched_init_bore(void); +extern void reset_task_bore(struct task_struct *p); + +extern int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +extern void reweight_entity( + struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); + +#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 7e2744ec89336a..87126e49fc3009 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -22,6 +22,7 @@ #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_LOCK_PI2 13 +#define FUTEX_WAIT_MULTIPLE 31 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -100,6 +101,18 @@ struct futex_waitv { __u32 __reserved; }; +/** + * struct futex_wait_block - Block of futexes to be waited for + * @uaddr: User address of the futex + * @val: Futex value expected by userspace + * @bitset: Bitset for the optional bitmasked wakeup + */ +struct futex_wait_block { + __u32 __user *uaddr; + __u32 val; + __u32 bitset; +}; + /* * Support for robust futexes: the kernel cleans up 
held futexes at * thread exit time. diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49e7..f93c898c20f886 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -187,6 +187,37 @@ config THREAD_INFO_IN_TASK menu "General setup" +config ZEN_INTERACTIVE + bool "Tune kernel for interactivity" + default y + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. + + --- Block Layer ---------------------------------------- + + Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber + + --- Virtual Memory Subsystem --------------------------- + + Background-reclaim hugepages...: no -> yes + Compact unevictable............: yes -> no + Watermark boost factor.........: 1.5 -> 0 + Swap-in readahead..............: 3 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + + Minimal granularity............: 0.7 -> 0.4 ms + Migration cost.................: 0.5 -> 0.3 ms + Bandwidth slice size...........: 5 -> 3 ms + Task rebalancing threshold.....: 32 -> 8 + + --- CPUFreq Settings ----------------------------------- + + Ondemand sampling down factor..: 1 -> 5 + Ondemand default up threshold..: 80 -> 55 + Ondemand micro up threshold....: 95 -> 60 + config BROKEN bool help @@ -1423,6 +1454,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. +config SCHED_BORE + bool "Burst-Oriented Response Enhancer" + default y + help + In Desktop and Mobile computing, one might prefer interactive + tasks to keep responsive no matter what they run in the background. + + Enabling this kernel feature modifies the scheduler to discriminate + tasks by their burst time (runtime since it last went sleeping or + yielding state) and prioritize those that run less bursty. + Such tasks usually include window compositor, widgets backend, + terminal emulator, video playback, games and so on. + With a little impact to scheduling fairness, it may improve + responsiveness especially under heavy background workload. + + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index ce1435cb08b1ec..9eee2005e25f07 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -57,3 +57,20 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS + +config MIN_BASE_SLICE_NS + int "Default value for min_base_slice_ns" + default 2000000 + help + The BORE Scheduler automatically calculates the optimal base + slice for the configured HZ using the following equation: + + base_slice_ns = + 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) + + This option sets the default lower bound limit of the base slice + to prevent the loss of task throughput due to overscheduling. + + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a95e..a166224b5f8638 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -116,6 +116,10 @@ /* For dup_mmap(). */ #include "../mm/internal.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + #include #define CREATE_TRACE_POINTS @@ -2325,6 +2329,10 @@ __latent_entropy struct task_struct *copy_process( * Need tasklist lock for parent etc handling! 
*/ write_lock_irq(&tasklist_lock); +#ifdef CONFIG_SCHED_BORE + if (likely(p->pid)) + task_fork_bore(p, current, clone_flags, p->start_time); +#endif /* CONFIG_SCHED_BORE */ /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 880c9bf2f31504..fb3b617b9ed83d 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -166,6 +166,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_WAIT_MULTIPLE: return true; } return false; @@ -178,13 +179,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) return -EINVAL; *t = timespec64_to_ktime(*ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } +/** + * futex_read_wait_block - Read an array of futex_wait_block from userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function creates and allocate an array of futex_q (we zero it to + * initialize the fields) and then, for each futex_wait_block element from + * userspace, fill a futex_q element with proper values. + */ +inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count) +{ + unsigned int i; + struct futex_vector *futexv; + struct futex_wait_block fwb; + struct futex_wait_block __user *entry = + (struct futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_WAITV_MAX) + return ERR_PTR(-EINVAL); + + futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(futexv); + return ERR_PTR(-EFAULT); + } + + futexv[i].w.flags = FUTEX_32; + futexv[i].w.val = fwb.val; + futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr); + futexv[i].q = futex_q_init; + } + + return futexv; +} + +int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to); + +int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count) +{ + int ret; + struct futex_vector *vs; + struct hrtimer_sleeper *to = NULL, timeout; + + to = futex_setup_timer(abs_time, &timeout, 0, 0); + + vs = futex_read_wait_block(uaddr, count); + + if (IS_ERR(vs)) + return PTR_ERR(vs); + + ret = futex_wait_multiple(vs, count, abs_time ? 
to : NULL); + kfree(vs); + + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + + return ret; +} + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) @@ -204,6 +271,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } @@ -512,6 +582,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec27a..6484ad583f3bf7 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,6 +4,9 @@ #include #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ #include "futex.h" @@ -355,7 +358,15 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) +#ifdef CONFIG_SCHED_BORE + { + current->bore.futex_waiting = true; +#endif /* CONFIG_SCHED_BORE */ schedule(); +#ifdef CONFIG_SCHED_BORE + current->bore.futex_waiting = false; + } +#endif /* CONFIG_SCHED_BORE */ } __set_current_state(TASK_RUNNING); } diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1b4254d19a73ec..40d19d6826afdd 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -109,6 +109,23 @@ config GENERIC_IRQ_STAT_SNAPSHOT config IRQ_FORCED_THREADING bool +config FORCE_IRQ_THREADING + bool "Make IRQ threading compulsory" + depends on IRQ_FORCED_THREADING + default n + help + + Make IRQ threading mandatory for any IRQ handlers that support it + instead of being optional and requiring the threadirqs kernel + parameter. Instead they can be optionally disabled with the + nothreadirqs kernel parameter. + + Enabling this may make some architectures not boot with runqueue + sharing and MuQSS. + + Enable if you are building for a desktop or low latency system, + otherwise say N. 
+ config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ help diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 400856abf67219..56f4ccc02dac37 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -25,7 +25,18 @@ #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_FORCE_IRQ_THREADING +DEFINE_STATIC_KEY_TRUE(force_irqthreads_key); +#else DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); +#endif + +static int __init setup_noforced_irqthreads(char *arg) +{ + static_branch_disable(&force_irqthreads_key); + return 0; +} +early_param("nothreadirqs", setup_noforced_irqthreads); static int __init setup_forced_irqthreads(char *arg) { diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae86371ddcddf..b688084bcecc75 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c new file mode 100644 index 00000000000000..7c1baa1bdf6c56 --- /dev/null +++ b/kernel/sched/bore.c @@ -0,0 +1,387 @@ +/* + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler + * Copyright (C) 2021-2025 Masahito Suzuki + */ +#include +#include +#include +#include "sched.h" + +#ifdef CONFIG_SCHED_BORE +u8 __read_mostly sched_bore = 1; +u8 __read_mostly sched_burst_inherit_type = 2; +u8 __read_mostly sched_burst_smoothness = 1; +u8 __read_mostly sched_burst_penalty_offset = 24; +uint __read_mostly sched_burst_penalty_scale = 1536; +uint __read_mostly sched_burst_cache_lifetime = 75000000; +static int __maybe_unused maxval_prio = 39; +static int __maybe_unused maxval_6_bits = 63; +static int __maybe_unused maxval_8_bits = 255; +static int __maybe_unused maxval_12_bits = 4095; + +#define MAX_BURST_PENALTY ((40U << 8) - 1) +#define BURST_CACHE_STOP_COUNT 63 + +static u32 (*inherit_penalty_fn)(struct task_struct *, u64, u64); + +static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { + if (!v) return 0; + u32 exponent = fls64(v), + mantissa = (u32)(v << (64 - exponent) << 1 >> (64 - fp)); + return exponent << fp | mantissa; +} + +static inline u32 calc_burst_penalty(u64 burst_time) { + u32 greed = log2p1_u64_u32fp(burst_time, 8), + tolerance = sched_burst_penalty_offset << 8, + penalty = max(0, (s32)(greed - tolerance)), + scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + return min(MAX_BURST_PENALTY, scaled_penalty); +} + +static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { + u64 unscaled, rescaled; + unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); + rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); + return rescaled; +} + +static inline u32 binary_smooth(u32 new, u32 old) { + if (new <= old) return new; + + u32 increment = new - old, + shift = sched_burst_smoothness, + divisor = 1U << shift; + + return old + ((increment + divisor - 1) >> shift); +} + +static void reweight_task_by_prio(struct task_struct *p, int prio) { + if (task_has_idle_policy(p)) return; + + struct sched_entity *se = &p->se; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + if (se->on_rq) { + p->bore.stop_update = true; + reweight_entity(cfs_rq_of(se), se, weight); + p->bore.stop_update = false; + } else + se->load.weight = weight; + se->load.inv_weight = sched_prio_to_wmult[prio]; +} + +u8 effective_prio_bore(struct task_struct *p) { + int prio = p->static_prio - 
MAX_RT_PRIO; + if (likely(sched_bore)) + prio += p->bore.score; + return (u8)clamp(prio, 0, maxval_prio); +} + +static void update_penalty(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + + u8 prev_prio = effective_prio_bore(p); + + ctx->penalty = (p->flags & PF_KTHREAD)? 0: + max(ctx->curr_penalty, ctx->prev_penalty); + + u8 new_prio = effective_prio_bore(p); + if (new_prio != prev_prio) + reweight_task_by_prio(p, new_prio); +} + +void update_curr_bore(struct task_struct *p, u64 delta_exec) { + struct bore_ctx *ctx = &p->bore; + if (ctx->stop_update) return; + + ctx->burst_time += delta_exec; + u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); + + if (curr_penalty <= ctx->prev_penalty) return; + update_penalty(p); +} + +void restart_burst_bore(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); + ctx->prev_penalty = new_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + update_penalty(p); +} + +void restart_burst_rescale_deadline_bore(struct task_struct *p) { + struct sched_entity *se = &p->se; + s64 vscaled, vremain = se->deadline - se->vruntime; + + u8 old_prio = effective_prio_bore(p); + restart_burst_bore(p); + u8 new_prio = effective_prio_bore(p); + + if (old_prio > new_prio) { + vscaled = rescale_slice(abs(vremain), old_prio, new_prio); + if (unlikely(vremain < 0)) + vscaled = -vscaled; + se->deadline = se->vruntime + vscaled; + } +} + +static inline bool task_is_bore_eligible(struct task_struct *p) +{return p && p->sched_class == &fair_sched_class && !p->exit_state;} + +#ifndef for_each_child_task +#define for_each_child_task(p, t) \ + list_for_each_entry(t, &(p)->children, sibling) +#endif + +static inline u32 count_children_upto2(struct task_struct *p) { + struct list_head *head = &p->children; + struct list_head *next = head->next; + return (next != head) + (next->next != head); +} + +static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { + u64 timestamp = bc->timestamp << BORE_BC_TIMESTAMP_SHIFT; + return now - timestamp > sched_burst_cache_lifetime; +} + +static void update_burst_cache(struct bore_bc *bc, + struct task_struct *p, u32 count, u32 total, u64 now) { + u32 average = count ? 
total / count : 0; + bc->penalty = max(average, p->bore.penalty); + bc->timestamp = now >> BORE_BC_TIMESTAMP_SHIFT; +} + +static u32 inherit_none(struct task_struct *parent, + u64 clone_flags, u64 now) +{ return 0; } + +static u32 inherit_from_parent(struct task_struct *parent, + u64 clone_flags, u64 now) { + if (clone_flags & CLONE_PARENT) + parent = parent->real_parent; + + struct bore_bc *bc = &parent->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *child; + u32 count = 0, total = 0; + for_each_child_task(parent, child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(child)) continue; + count++; + total += child->bore.penalty; + } + + update_burst_cache(bc, parent, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_ancestor_hub(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct task_struct *ancestor = parent; + u32 sole_child_count = 0; + + if (clone_flags & CLONE_PARENT) { + ancestor = ancestor->real_parent; + sole_child_count = 1; + } + + for (struct task_struct *next; + (next = ancestor->real_parent) != ancestor && + count_children_upto2(ancestor) <= sole_child_count; + ancestor = next, sole_child_count = 1) {} + + struct bore_bc *bc = &ancestor->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *direct_child; + u32 count = 0, total = 0; + for_each_child_task(ancestor, direct_child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + struct task_struct *descendant = direct_child; + while (count_children_upto2(descendant) == 1) + descendant = list_first_entry(&descendant->children, + struct task_struct, sibling); + + if (!task_is_bore_eligible(descendant)) continue; + count++; + total += descendant->bore.penalty; + } + + update_burst_cache(bc, ancestor, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { + struct task_struct *leader = p->group_leader; + struct bore_bc *bc = &leader->bore.group; + + if (burst_cache_expired(bc, now)) { + struct task_struct *sibling; + u32 count = 0, total = 0; + + for_each_thread(leader, sibling) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(sibling)) continue; + count++; + total += sibling->bore.penalty; + } + + update_burst_cache(bc, leader, count, total, now); + } + + return bc->penalty; +} + +void task_fork_bore(struct task_struct *p, + struct task_struct *parent, u64 clone_flags, u64 now) { + if (!task_is_bore_eligible(p) || unlikely(!sched_bore)) return; + + struct bore_ctx *ctx = &p->bore; + u32 inherited_penalty = (clone_flags & CLONE_THREAD)? 
+ inherit_from_thread_group(parent, now): + inherit_penalty_fn(parent, clone_flags, now); + + if (ctx->prev_penalty < inherited_penalty) + ctx->prev_penalty = inherited_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + ctx->stop_update = false; + ctx->futex_waiting = false; + update_penalty(p); +} + +void reset_task_bore(struct task_struct *p) +{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } + +static void update_inherit_type(void) { + switch(sched_burst_inherit_type) { + case 1: + inherit_penalty_fn = inherit_from_parent; + break; + case 2: + inherit_penalty_fn = inherit_from_ancestor_hub; + break; + default: + inherit_penalty_fn = inherit_none; + } +} + +void __init sched_init_bore(void) { + printk(KERN_INFO "%s %s by %s\n", + SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); + + reset_task_bore(&init_task); + update_inherit_type(); +} + +static void readjust_all_task_weights(void) { + struct task_struct *task; + struct rq *rq; + struct rq_flags rf; + + scoped_guard(write_lock_irq, &tasklist_lock) + for_each_process(task) { + if (!task_is_bore_eligible(task)) continue; + rq = task_rq_lock(task, &rf); + update_rq_clock(rq); + reweight_task_by_prio(task, effective_prio_bore(task)); + task_rq_unlock(rq, task, &rf); + } +} + +int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_inherit_type(); + + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_bore_sysctls[] = { + { + .procname = "sched_bore", + .data = &sched_bore, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_bore_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_inherit_type", + .data = &sched_burst_inherit_type, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_inherit_type_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_burst_penalty_offset", + .data = &sched_burst_penalty_offset, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_6_bits, + }, + { + .procname = "sched_burst_penalty_scale", + .data = &sched_burst_penalty_scale, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_12_bits, + }, + { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static int __init sched_bore_sysctl_init(void) { + register_sysctl_init("kernel", sched_bore_sysctls); + return 0; +} +late_initcall(sched_bore_sysctl_init); + +#endif // CONFIG_SYSCTL +#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f754a60de84849..c5a6f727fef143 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -100,6 
+100,10 @@ #include "../smpboot.h" #include "../locking/mutex.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1431,7 +1435,11 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { +#ifdef CONFIG_SCHED_BORE + int prio = effective_prio_bore(p); +#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; +#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8655,6 +8663,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); +#endif /* CONFIG_SCHED_BORE */ + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a7901e..751df396d94ba6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_BORE +#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ +static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ +{ \ + char buf[16]; \ + unsigned int value; \ +\ + if (cnt > 15) \ + cnt = 15; \ +\ + if (copy_from_user(&buf, ubuf, cnt)) \ + return -EFAULT; \ + buf[cnt] = '\0'; \ +\ + if (kstrtouint(buf, 10, &value)) \ + return -EINVAL; \ +\ + sysctl_sched_##name = value; \ + sched_update_##update_func(); \ +\ + *ppos += cnt; \ + return cnt; \ +} \ +\ +static int sched_##name##_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", sysctl_sched_##name); \ + return 0; \ +} \ +\ +static int sched_##name##_open(struct inode *inode, struct file *filp) \ +{ \ + return single_open(filp, sched_##name##_show, NULL); \ +} \ +\ +static const struct file_operations sched_##name##_fops = { \ + .open = sched_##name##_open, \ + .write = sched_##name##_write, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + +#undef DEFINE_SYSCTL_SCHED_FUNC +#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,6 +261,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -500,12 +548,19 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifdef CONFIG_SCHED_BORE + debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); + debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); +#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, 
&sysctl_sched_nr_migrate); @@ -747,6 +802,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE + SEQ_printf(m, " %2d", p->bore.score); +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1224,6 +1282,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); +#ifdef CONFIG_SCHED_BORE + P(bore.score); +#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b752324270b08..66526308dcaf98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,6 +58,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * The initial- and re-scaling of tunables is configurable * @@ -67,20 +71,42 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant + * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * + * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_SCHED_BORE + +static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; +unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; + +#elifdef CONFIG_ZEN_INTERACTIVE + +unsigned int sysctl_sched_base_slice = 400000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; +__read_mostly unsigned int sysctl_sched_migration_cost = 300000UL; + +#else + unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; - __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; +#endif /* CONFIG_SCHED_BORE */ + static int __init setup_sched_thermal_decay_shift(char *str) { pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); @@ -122,8 +148,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ @@ -189,6 +219,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE +static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} +void sched_update_min_base_slice(void) { update_sysctl(); } +#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -219,6 +256,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } +#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -698,6 +736,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) vlag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE + limit >>= !!sched_bore; +#endif /* CONFIG_SCHED_BORE */ se->vlag = clamp(vlag, -limit, limit); } @@ -891,10 +932,17 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) */ static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#ifdef CONFIG_SCHED_BORE + u64 slice = sysctl_sched_base_slice; + bool run_to_parity = likely(sched_bore) ? + sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); +#else /* CONFIG_SCHED_BORE */ u64 slice = normalized_sysctl_sched_base_slice; + bool run_to_parity = sched_feat(RUN_TO_PARITY); +#endif /* CONFIG_SCHED_BORE */ u64 vprot = se->deadline; - if (sched_feat(RUN_TO_PARITY)) + if (run_to_parity) slice = cfs_rq_min_slice(cfs_rq); slice = min(slice, se->slice); @@ -959,6 +1007,11 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) curr = NULL; if (curr && protect && protect_slice(curr)) +#ifdef CONFIG_SCHED_BORE + if (!entity_is_task(curr) || + !task_of(curr)->bore.futex_waiting || + unlikely(!sched_bore)) +#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1020,6 +1073,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ +#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1031,6 +1085,7 @@ int sched_update_scaling(void) return 0; } +#endif /* CONFIG_SCHED_BORE */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1229,6 +1284,11 @@ static void update_curr(struct cfs_rq *cfs_rq) update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { +#ifdef CONFIG_SCHED_BORE + struct task_struct *p = task_of(curr); + update_curr_bore(p, delta_exec); +#endif /* CONFIG_SCHED_BORE */ + /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -3767,7 +3827,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; @@ -5124,12 +5184,11 @@ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice, vruntime = avg_vruntime(cfs_rq); + 
u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5214,7 +5273,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } - +#ifdef CONFIG_SCHED_BORE + if (entity_is_task(se) && + likely(sched_bore) && + task_of(se)->bore.futex_waiting) + goto vslice_found; +#endif /* !CONFIG_SCHED_BORE */ + vslice = calc_delta_fair(se->slice, se); +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); + else +#endif /* CONFIG_SCHED_BORE */ /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5223,6 +5293,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; +#ifdef CONFIG_SCHED_BORE +vslice_found: +#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5233,7 +5306,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void -requeue_delayed_entity(struct sched_entity *se); +requeue_delayed_entity(struct sched_entity *se, int flags); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5391,6 +5464,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); +#ifdef CONFIG_SCHED_BORE + if (sched_feat(DELAY_ZERO) && likely(sched_bore)) + update_entity_lag(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -6879,7 +6956,7 @@ static int sched_idle_cpu(int cpu) } static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -6892,13 +6969,22 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + flags |= ENQUEUE_WAKEUP; + else { +#endif /* CONFIG_SCHED_BORE */ + flags = 0; update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + } +#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, 0); + place_entity(cfs_rq, se, flags); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; @@ -6938,7 +7024,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); return; } @@ -6956,7 +7042,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); break; } cfs_rq = cfs_rq_of(se); @@ -7169,6 +7255,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); +#ifdef CONFIG_SCHED_BORE + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se = &p->se; + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + if (cfs_rq->curr == se) + 
update_curr(cfs_rq_of(&p->se)); + restart_burst_bore(p); + } +#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -7536,9 +7631,14 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas return new_cpu; } +static inline bool is_idle_cpu(int cpu) +{ + return available_idle_cpu(cpu) || sched_idle_cpu(cpu); +} + static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + if (is_idle_cpu(cpu) && sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; @@ -7549,6 +7649,24 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p) DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); +/* + * Return true if all the CPUs in the SMT core where @cpu belongs are idle, + * false otherwise. + */ +static bool is_idle_core(int cpu) +{ + int sibling; + + if (!sched_smt_active()) + return is_idle_cpu(cpu); + + for_each_cpu(sibling, cpu_smt_mask(cpu)) + if (!is_idle_cpu(sibling)) + return false; + + return true; +} + static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; @@ -7631,29 +7749,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { - if (cpu == target) - continue; - /* - * Check if the CPU is in the LLC scheduling domain of @target. - * Due to isolcpus, there is no guarantee that all the siblings are in the domain. - */ - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } - - return -1; -} - #else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) @@ -7670,9 +7765,9 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline bool is_idle_core(int cpu) { - return -1; + return is_idle_cpu(cpu); } #endif /* !CONFIG_SCHED_SMT */ @@ -7769,7 +7864,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + if (!is_idle_cpu(cpu)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); @@ -7840,7 +7935,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + if (is_idle_core(target) && asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -7848,7 +7943,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + is_idle_core(prev) && asym_fits_cpu(task_util, util_min, util_max, prev)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ -7880,7 +7975,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - 
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + is_idle_core(recent_used_cpu) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { @@ -7916,16 +8011,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - if (sched_smt_active()) { + if (sched_smt_active()) has_idle_core = test_idle_cores(target); - if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); - if ((unsigned int)i < nr_cpumask_bits) - return i; - } - } - i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -8817,7 +8905,13 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; +#ifdef CONFIG_SCHED_BORE + bool run_to_parity = likely(sched_bore) ? + sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); + if (run_to_parity && do_preempt_short) +#else /* CONFIG_SCHED_BORE */ if (sched_feat(RUN_TO_PARITY) && do_preempt_short) +#endif /* CONFIG_SCHED_BORE */ update_protect_slice(cfs_rq, se); return; @@ -8997,16 +9091,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ +#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_BORE + restart_burst_rescale_deadline_bore(curr); + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13256,6 +13359,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); +#ifdef CONFIG_SCHED_BORE + reset_task_bore(p); +#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331d6..abadc5ca74e2dd 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -18,6 +18,9 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) * 0-lag point or until is has exhausted it's slice. */ SCHED_FEAT(RUN_TO_PARITY, true) +#ifdef CONFIG_SCHED_BORE +SCHED_FEAT(RUN_TO_PARITY_BORE, false) +#endif /* CONFIG_SCHED_BORE */ /* * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for * current. 
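Before the kernel/sched/sched.h hunk below, it may help to see the burst-penalty arithmetic from the kernel/sched/bore.c hunk above in isolation. The following standalone program is a sketch under stated assumptions: it re-implements log2p1_u64_u32fp() and calc_burst_penalty() with the patch's default sysctl values (offset 24, scale 1536), replaces fls64() with __builtin_clzll(), and divides the 8.8 fixed-point result by 256 only to express it in whole units; how that maps onto the bore.score field used by effective_prio_bore() is defined in a header not shown in this hunk.

/*
 * Standalone model of the BORE burst-penalty curve (illustration only).
 * Mirrors log2p1_u64_u32fp() and calc_burst_penalty() from bore.c above.
 */
#include <stdint.h>
#include <stdio.h>

static const uint32_t sched_burst_penalty_offset = 24;   /* patch default */
static const uint32_t sched_burst_penalty_scale  = 1536; /* patch default */
#define MAX_BURST_PENALTY ((40U << 8) - 1)

/* fls64()-style log2-plus-1 with an 8-bit fixed-point fractional part. */
static uint32_t log2p1_u64_u32fp(uint64_t v, uint8_t fp)
{
	if (!v)
		return 0;
	uint32_t exponent = 64 - (uint32_t)__builtin_clzll(v);  /* fls64(v) */
	uint32_t mantissa = (uint32_t)(v << (64 - exponent) << 1 >> (64 - fp));
	return exponent << fp | mantissa;
}

static uint32_t calc_burst_penalty(uint64_t burst_time_ns)
{
	uint32_t greed = log2p1_u64_u32fp(burst_time_ns, 8);
	uint32_t tolerance = sched_burst_penalty_offset << 8;
	int32_t diff = (int32_t)(greed - tolerance);
	uint32_t penalty = diff > 0 ? (uint32_t)diff : 0;
	uint32_t scaled = penalty * sched_burst_penalty_scale >> 10;
	return scaled < MAX_BURST_PENALTY ? scaled : MAX_BURST_PENALTY;
}

int main(void)
{
	const uint64_t bursts_ns[] = {
		1000000ULL,     /*   1 ms: below the ~2^23 ns threshold -> 0 */
		10000000ULL,    /*  10 ms: just above the threshold          */
		100000000ULL,   /* 100 ms                                    */
		1000000000ULL,  /*   1 s                                     */
	};

	for (unsigned i = 0; i < sizeof(bursts_ns) / sizeof(bursts_ns[0]); i++) {
		uint32_t p = calc_burst_penalty(bursts_ns[i]);
		/* >> 8 converts the 8.8 fixed-point penalty to whole units. */
		printf("burst %10llu ns -> penalty %4u (~%u units)\n",
		       (unsigned long long)bursts_ns[i], p, p >> 8);
	}
	return 0;
}

With the default offset of 24 the penalty stays at zero until a burst exceeds roughly 2^23 ns (about 8.4 ms, since log2p1 is fls()-based and counts from 1), then grows with the logarithm of the burst length, scaled by sched_burst_penalty_scale.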
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d729..39732d2daf80ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2130,7 +2130,11 @@ extern int group_balance_cpu(struct sched_group *sg); extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); +#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); +#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -2800,7 +2804,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); -#ifdef CONFIG_PREEMPT_RT +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ZEN_INTERACTIVE) # define SCHED_NR_MIGRATE_BREAK 8 #else # define SCHED_NR_MIGRATE_BREAK 32 @@ -2809,7 +2813,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern __read_mostly unsigned int sysctl_sched_nr_migrate; extern __read_mostly unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_BORE +extern unsigned int sysctl_sched_min_base_slice; +extern __read_mostly uint sysctl_sched_base_slice; +#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; diff --git a/mm/Kconfig b/mm/Kconfig index ca3f146bc7053a..de643ce40c1108 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -631,7 +631,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || ZEN_INTERACTIVE default 1 # diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6cba1cb14b23ac..43449e5b51aee8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -63,7 +63,11 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -274,7 +276,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_ZEN_INTERACTIVE +static int watermark_boost_factor __read_mostly; +#else static int watermark_boost_factor __read_mostly = 15000; +#endif static int watermark_scale_factor = 10; int defrag_mode; @@ -4640,6 +4646,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool woke_kswapd = false; if (unlikely(nofail)) { /* @@ -4699,8 +4706,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; } - if (alloc_flags & ALLOC_KSWAPD) + if (alloc_flags & ALLOC_KSWAPD) { + if (!woke_kswapd) { + atomic_long_inc(&kswapd_waiters); + woke_kswapd = true; + } wake_all_kswapds(order, gfp_mask, ac); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4915,9 +4927,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto retry; } fail: - warn_alloc(gfp_mask, ac->nodemask, - "page allocation failure: order:%u", order); got_pg: + if (woke_kswapd) + atomic_long_dec(&kswapd_waiters); + if (!page) + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); return page; } diff --git a/mm/swap.c b/mm/swap.c index 2260dcd2775e75..3a7aefc524e56a 
100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1101,6 +1101,10 @@ static const struct ctl_table swap_sysctl_table[] = { */ void __init swap_setup(void) { +#ifdef CONFIG_ZEN_INTERACTIVE + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ @@ -1112,6 +1116,7 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif register_sysctl_init("vm", swap_sysctl_table); } diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3dff..3f0cfcf9cdf410 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6484,7 +6484,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 0; } -static bool allow_direct_reclaim(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -6509,6 +6509,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) wmark_ok = free_pages > pfmemalloc_reserve / 2; + /* The throttled direct reclaimer is now a kswapd waiter */ + if (unlikely(!using_kswapd && !wmark_ok)) + atomic_long_inc(&kswapd_waiters); + /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) @@ -6574,7 +6578,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (allow_direct_reclaim(pgdat)) + if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM)) goto out; break; } @@ -6596,11 +6600,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat), HZ); + allow_direct_reclaim(pgdat, true), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); + allow_direct_reclaim(pgdat, true)); + + if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM))) + atomic_long_dec(&kswapd_waiters); if (fatal_signal_pending(current)) return true; @@ -7130,14 +7137,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - allow_direct_reclaim(pgdat)) + allow_direct_reclaim(pgdat, true)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (was_frozen || ret) + if (was_frozen || ret || !atomic_long_read(&kswapd_waiters)) break; /*