diff --git a/Makefile b/Makefile index a082a1d7c7d9b4..22318c98072274 100644 --- a/Makefile +++ b/Makefile @@ -1067,11 +1067,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check -# conserve stack if available -ifdef CONFIG_CC_IS_GCC -KBUILD_CFLAGS += -fconserve-stack -endif - # Ensure compilers do not transform certain loops into calls to wcslen() KBUILD_CFLAGS += -fno-builtin-wcslen diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1a27efcf3c205a..2f06bd6847edd1 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -75,7 +75,7 @@ export BITS # # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a -mno-avx2 -fno-tree-vectorize KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b3ab80a03365cf..5e883b397ff3f2 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -26,6 +26,7 @@ struct pci_sysdata { #if IS_ENABLED(CONFIG_VMD) struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ #endif + struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */ }; extern int pci_routeirq; @@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus *bus) #define is_vmd(bus) false #endif /* CONFIG_VMD */ +static inline bool is_nvme_remap(struct pci_bus *bus) +{ + return to_pci_sysdata(bus)->nvme_remap_dev != NULL; +} + /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index ddb798603201ef..7c20387d82029a 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) return 0; } -#if IS_ENABLED(CONFIG_VMD) struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) { +#if IS_ENABLED(CONFIG_VMD) if (is_vmd(dev->bus)) return to_pci_sysdata(dev->bus)->vmd_dev; +#endif + + if (is_nvme_remap(dev->bus)) + return to_pci_sysdata(dev->bus)->nvme_remap_dev; return dev; } -#endif diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 27f11320b8d12d..dcbcf7c8a3b225 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -16,6 +16,20 @@ config MQ_IOSCHED_KYBER synchronous writes, it will self-tune queue depths to achieve that goal. +config MQ_IOSCHED_ADIOS + tristate "Adaptive Deadline I/O scheduler" + default y + help + The Adaptive Deadline I/O Scheduler (ADIOS) is a multi-queue I/O + scheduler with learning-based adaptive latency control. + +config MQ_IOSCHED_DEFAULT_ADIOS + bool "Enable ADIOS I/O scheduler as default MQ I/O scheduler" + depends on MQ_IOSCHED_ADIOS=y + default y + help + Enable the ADIOS I/O scheduler as the default scheduler for MQ I/O. 
+ config IOSCHED_BFQ tristate "BFQ I/O scheduler" select BLK_ICQ diff --git a/block/Makefile b/block/Makefile index c65f4da937026f..105b12fd86b842 100644 --- a/block/Makefile +++ b/block/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o +obj-$(CONFIG_MQ_IOSCHED_ADIOS) += adios.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o @@ -36,3 +37,10 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o + +all: + make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules + +clean: + make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean + diff --git a/block/adios.c b/block/adios.c new file mode 100644 index 00000000000000..41de52785cd1ed --- /dev/null +++ b/block/adios.c @@ -0,0 +1,2006 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Adaptive Deadline I/O Scheduler (ADIOS) + * Copyright (C) 2025 Masahito Suzuki + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "elevator.h" +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" + +#define ADIOS_VERSION "3.1.7" + +/* Request Types: + * + * Tier 0 (Highest Priority): Emergency & System Integrity Requests + * ----------------------------------------------------------------- + * - Target: Requests with the BLK_MQ_INSERT_AT_HEAD flag. + * - Purpose: For critical, non-negotiable operations such as device error + * recovery or flush sequences that must bypass all other scheduling logic. + * - Implementation: Placed in a dedicated, high-priority FIFO queue + * (`prio_queue[0]`) for immediate dispatch. + * + * Tier 1 (High Priority): I/O Barrier Guarantees + * --------------------------------------------------------------- + * - Target: Requests with the REQ_OP_FLUSH flag. + * - Purpose: To enforce a strict I/O barrier. When a flush request is + * received, the scheduler stops processing new requests from its main + * queues until all preceding requests have been completed. This guarantees + * the order of operations required by filesystems for data integrity. + * - Implementation: A state flag (ADIOS_STATE_BARRIER) halts + * insertion into the main deadline tree. The barrier request and all + * subsequent requests are held in a temporary `barrier_queue`. Once the + * main queues are drained, the barrier request and the subsequent requests + * are released from the pending queue back into the scheduler. + * + * Tier 2 (Medium Priority): Application Responsiveness + * ---------------------------------------------------- + * - Target: Normal synchronous requests (e.g., from standard file reads). + * - Purpose: To ensure correct application behavior for operations that + * depend on sequential I/O completion (e.g., file system mounts) and to + * provide low latency for interactive applications. + * - Implementation: The deadline for these requests is set to their start + * time (`rq->start_time_ns`). This effectively enforces FIFO-like behavior + * within the deadline-sorted red-black tree, preventing out-of-order + * execution of dependent synchronous operations. 
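+ * (For example, two synchronous reads submitted 1 ms apart keep their
+ * submission-time deadlines and are therefore dispatched in that order.)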
+ * + * Tier 3 (Normal Priority): Background Throughput + * ----------------------------------------------- + * - Target: Asynchronous requests. + * - Purpose: To maximize disk throughput for background tasks where latency + * is not critical. + * - Implementation: These are the only requests where ADIOS's adaptive + * latency prediction model is used. A dynamic deadline is calculated based + * on the predicted I/O latency, allowing for aggressive reordering to + * optimize I/O efficiency. + * + * Dispatch Logic: + * The scheduler always dispatches requests in strict priority order: + * 1. prio_queue[0] (Tier 0) + * 2. The deadline-sorted batch queue (which naturally prioritizes Tier 2 + * over Tier 3 due to their calculated deadlines). + * 3. Barrier-pending requests are handled only after the main queues are empty. + */ + +// Global variable to control the latency +static u64 default_global_latency_window = 16000000ULL; +static u64 default_global_latency_window_rotational = 22000000ULL; +// Ratio below which batch queues should be refilled +static u8 default_bq_refill_below_ratio = 20; +// Maximum latency sample to input +static u64 default_lat_model_latency_limit = 500000000ULL; +// Batch ordering strategy +static u64 default_batch_order = 0; + +/* Compliance Flags: + * 0x1: Async requests will not be reordered based on the predicted latency + */ +enum adios_compliance_flags { + ADIOS_CF_FIXORDER = 1U << 0, +}; + +// Flags to control compliance with block layer constraints +static u64 default_compliance_flags = 0x0; + +// Dynamic thresholds for shrinkage +static u32 default_lm_shrink_at_kreqs = 5000; +static u32 default_lm_shrink_at_gbytes = 50; +static u32 default_lm_shrink_resist = 2; + +enum adios_optype { + ADIOS_READ = 0, + ADIOS_WRITE = 1, + ADIOS_DISCARD = 2, + ADIOS_OTHER = 3, + ADIOS_OPTYPES = 4, +}; + +// Latency targets for each operation type +static u64 default_latency_target[ADIOS_OPTYPES] = { + [ADIOS_READ] = 2ULL * NSEC_PER_MSEC, + [ADIOS_WRITE] = 2000ULL * NSEC_PER_MSEC, + [ADIOS_DISCARD] = 8000ULL * NSEC_PER_MSEC, + [ADIOS_OTHER] = 0ULL * NSEC_PER_MSEC, +}; + +// Maximum batch size limits for each operation type +static u32 default_batch_limit[ADIOS_OPTYPES] = { + [ADIOS_READ] = 36, + [ADIOS_WRITE] = 72, + [ADIOS_DISCARD] = 1, + [ADIOS_OTHER] = 1, +}; + +enum adios_batch_order { + ADIOS_BO_OPTYPE = 0, + ADIOS_BO_ELEVATOR = 1, +}; + +// Thresholds for latency model control +#define LM_BLOCK_SIZE_THRESHOLD 4096 +#define LM_SAMPLES_THRESHOLD 1024 +#define LM_INTERVAL_THRESHOLD 1500 +#define LM_OUTLIER_PERCENTILE 99 +#define LM_LAT_BUCKET_COUNT 64 + +#define ADIOS_PQ_LEVELS 2 +#define ADIOS_DL_TYPES 2 +#define ADIOS_BQ_PAGES 2 + +static u32 default_dl_prio[ADIOS_DL_TYPES] = {8, 0}; + +// Bit flags for the atomic state variable, indicating which queues have requests. 
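+// These bits are grouped into two-bit pairs (priority queues, deadline
+// trees, batch queue pages) plus a single barrier-pending bit;
+// eval_adios_state() shifts by ADIOS_STATE_{PQ,DL,BQ,BP} and masks with
+// 0x3 to read one group.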
+enum adios_state_flags { + ADIOS_STATE_PQ_0 = 1U << 0, + ADIOS_STATE_PQ_1 = 1U << 1, + ADIOS_STATE_DL_0 = 1U << 2, + ADIOS_STATE_DL_1 = 1U << 3, + ADIOS_STATE_BQ_PAGE_0 = 1U << 4, + ADIOS_STATE_BQ_PAGE_1 = 1U << 5, + ADIOS_STATE_BARRIER = 1U << 6, +}; +#define ADIOS_STATE_PQ 0 +#define ADIOS_STATE_DL 2 +#define ADIOS_STATE_BQ 4 +#define ADIOS_STATE_BP 6 + +// Temporal granularity of the deadline tree node (dl_group) +#define ADIOS_QUANTUM_SHIFT 20 + +#define ADIOS_MAX_INSERTS_PER_LOCK 72 +#define ADIOS_MAX_DELETES_PER_LOCK 24 + +// Structure to hold latency bucket data for small requests +struct latency_bucket_small { + u64 weighted_sum_latency; + u64 sum_of_weights; +}; + +// Structure to hold latency bucket data for large requests +struct latency_bucket_large { + u64 weighted_sum_latency; + u64 weighted_sum_block_size; + u64 sum_of_weights; +}; + +// Structure to hold per-cpu buckets, improving data locality and code clarity. +struct lm_buckets { + struct latency_bucket_small small_bucket[LM_LAT_BUCKET_COUNT]; + struct latency_bucket_large large_bucket[LM_LAT_BUCKET_COUNT]; +}; + +// Structure to hold RCU-protected latency model parameters +struct latency_model_params { + u64 base; + u64 slope; + u64 small_sum_delay; + u64 small_count; + u64 large_sum_delay; + u64 large_sum_bsize; + u64 last_update_jiffies; + struct rcu_head rcu; +}; + +// Structure to hold the latency model context data +struct latency_model { + spinlock_t update_lock; + struct latency_model_params __rcu *params; + + // Per-CPU buckets to avoid lock contention on the completion path + struct lm_buckets __percpu *pcpu_buckets; + + u32 lm_shrink_at_kreqs; + u32 lm_shrink_at_gbytes; + u8 lm_shrink_resist; +}; + +union adios_in_flight_rqs { + atomic64_t atomic; + u64 scalar; + struct { + u64 count: 16; + u64 total_pred_lat: 48; + }; +}; + +// Adios scheduler data +struct adios_data { + spinlock_t pq_lock; + struct list_head prio_queue[2]; + + struct rb_root_cached dl_tree[2]; + spinlock_t lock; + s64 dl_bias; + s32 dl_prio[2]; + + atomic_t state; + u8 bq_state[ADIOS_BQ_PAGES]; + + void (*insert_request_fn)(struct blk_mq_hw_ctx *, struct request *, + blk_insert_t, struct list_head *); + + u64 global_latency_window; + u64 compliance_flags; + u64 latency_target[ADIOS_OPTYPES]; + u32 batch_limit[ADIOS_OPTYPES]; + u32 batch_actual_max_size[ADIOS_OPTYPES]; + u32 batch_actual_max_total; + u32 async_depth; + u32 lat_model_latency_limit; + u8 bq_refill_below_ratio; + u8 is_rotational; + u8 batch_order; + u8 elv_direction; + sector_t head_pos; + sector_t last_completed_pos; + + bool bq_page; + struct list_head batch_queue[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; + u32 batch_count[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; + u8 bq_batch_order[ADIOS_BQ_PAGES]; + spinlock_t bq_lock; + spinlock_t barrier_lock; + struct list_head barrier_queue; + + struct lm_buckets *aggr_buckets; + + struct latency_model latency_model[ADIOS_OPTYPES]; + struct timer_list update_timer; + + union adios_in_flight_rqs in_flight_rqs; + atomic64_t total_pred_lat; + u64 last_completed_time; + + struct kmem_cache *rq_data_pool; + struct kmem_cache *dl_group_pool; + + struct request_queue *queue; +}; + +// List of requests with the same deadline in the deadline-sorted tree +struct dl_group { + struct rb_node node; + struct list_head rqs; + u64 deadline; +} __attribute__((aligned(64))); + +// Structure to hold scheduler-specific data for each request +struct adios_rq_data { + struct list_head *dl_group; + struct list_head dl_node; + + struct request *rq; + u64 deadline; + u64 
pred_lat; + u32 block_size; + bool managed; +} __attribute__((aligned(64))); + +static const int adios_prio_to_wmult[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +static inline bool compliant(struct adios_data *ad, u32 flag) { + return ad->compliance_flags & flag; +} + +// Count the number of entries in aggregated small buckets +static u64 lm_count_small_entries(struct latency_bucket_small *buckets) { + u64 total_weight = 0; + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) + total_weight += buckets[i].sum_of_weights; + return total_weight; +} + +// Update the small buckets in the latency model from aggregated data +static bool lm_update_small_buckets(struct latency_model *model, + struct latency_model_params *params, + struct latency_bucket_small *buckets, + u64 total_weight, bool count_all) { + u64 sum_latency = 0; + u64 sum_weight = 0; + u64 cumulative_weight = 0, threshold_weight = 0; + u8 outlier_threshold_bucket = 0; + u8 outlier_percentile = LM_OUTLIER_PERCENTILE; + u8 reduction; + + if (count_all) + outlier_percentile = 100; + + // Calculate the threshold weight for outlier detection + threshold_weight = (total_weight * outlier_percentile) / 100; + + // Identify the bucket that corresponds to the outlier threshold + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + cumulative_weight += buckets[i].sum_of_weights; + if (cumulative_weight >= threshold_weight) { + outlier_threshold_bucket = i; + break; + } + } + + // Calculate the average latency, excluding outliers + for (u8 i = 0; i <= outlier_threshold_bucket; i++) { + struct latency_bucket_small *bucket = &buckets[i]; + if (i < outlier_threshold_bucket) { + sum_latency += bucket->weighted_sum_latency; + sum_weight += bucket->sum_of_weights; + } else { + // The threshold bucket's contribution is proportional + u64 remaining_weight = + threshold_weight - (cumulative_weight - bucket->sum_of_weights); + if (bucket->sum_of_weights > 0) { + sum_latency += div_u64(bucket->weighted_sum_latency * + remaining_weight, bucket->sum_of_weights); + sum_weight += remaining_weight; + } + } + } + + // Shrink the model if it reaches at the readjustment threshold + if (params->small_count >= 1000ULL * model->lm_shrink_at_kreqs) { + reduction = model->lm_shrink_resist; + if (params->small_count >> reduction) { + params->small_sum_delay -= params->small_sum_delay >> reduction; + params->small_count -= params->small_count >> reduction; + } + } + + if (!sum_weight) + return false; + + // Accumulate the average latency into the statistics + params->small_sum_delay += sum_latency; + params->small_count += sum_weight; + + return true; +} + +// Count the number of entries in aggregated large buckets +static u64 lm_count_large_entries(struct latency_bucket_large *buckets) { + u64 total_weight = 0; + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) + total_weight += buckets[i].sum_of_weights; + return total_weight; +} + +// Update the large buckets in the latency model from aggregated data +static bool lm_update_large_buckets(struct latency_model *model, + struct latency_model_params *params, + struct latency_bucket_large *buckets, + u64 total_weight, bool count_all) { + s64 sum_latency = 0; + u64 sum_block_size = 0, intercept; + u64 cumulative_weight = 0, threshold_weight = 0; + u64 sum_weight 
= 0; + u8 outlier_threshold_bucket = 0; + u8 outlier_percentile = LM_OUTLIER_PERCENTILE; + u8 reduction; + + if (count_all) + outlier_percentile = 100; + + // Calculate the threshold weight for outlier detection + threshold_weight = (total_weight * outlier_percentile) / 100; + + // Identify the bucket that corresponds to the outlier threshold + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + cumulative_weight += buckets[i].sum_of_weights; + if (cumulative_weight >= threshold_weight) { + outlier_threshold_bucket = i; + break; + } + } + + // Calculate the average latency and block size, excluding outliers + for (u8 i = 0; i <= outlier_threshold_bucket; i++) { + struct latency_bucket_large *bucket = &buckets[i]; + if (i < outlier_threshold_bucket) { + sum_latency += bucket->weighted_sum_latency; + sum_block_size += bucket->weighted_sum_block_size; + sum_weight += bucket->sum_of_weights; + } else { + // The threshold bucket's contribution is proportional + u64 remaining_weight = + threshold_weight - (cumulative_weight - bucket->sum_of_weights); + if (bucket->sum_of_weights > 0) { + sum_latency += div_u64(bucket->weighted_sum_latency * + remaining_weight, bucket->sum_of_weights); + sum_block_size += div_u64(bucket->weighted_sum_block_size * + remaining_weight, bucket->sum_of_weights); + sum_weight += remaining_weight; + } + } + } + + if (!sum_weight) + return false; + + // Shrink the model if it reaches at the readjustment threshold + if (params->large_sum_bsize >= 0x40000000ULL * model->lm_shrink_at_gbytes) { + reduction = model->lm_shrink_resist; + if (params->large_sum_bsize >> reduction) { + params->large_sum_delay -= params->large_sum_delay >> reduction; + params->large_sum_bsize -= params->large_sum_bsize >> reduction; + } + } + + // Accumulate the average delay into the statistics + intercept = params->base; + if (sum_latency > intercept) + sum_latency -= intercept; + + params->large_sum_delay += sum_latency; + params->large_sum_bsize += sum_block_size; + + return true; +} + +static void reset_buckets(struct lm_buckets *buckets) +{ memset(buckets, 0, sizeof(*buckets)); } + +static void lm_reset_pcpu_buckets(struct latency_model *model) { + int cpu; + for_each_possible_cpu(cpu) + reset_buckets(per_cpu_ptr(model->pcpu_buckets, cpu)); +} + +// Update the latency model parameters and statistics +static void latency_model_update( + struct adios_data *ad, struct latency_model *model) { + u64 now; + u64 small_weight, large_weight; + bool time_elapsed; + bool small_processed = false, large_processed = false; + struct lm_buckets *aggr = ad->aggr_buckets; + struct latency_bucket_small *asb; + struct latency_bucket_large *alb; + struct lm_buckets *pcpu_b; + unsigned long flags; + int cpu; + struct latency_model_params *old_params, *new_params; + + spin_lock_irqsave(&model->update_lock, flags); + + old_params = rcu_dereference_protected(model->params, + lockdep_is_held(&model->update_lock)); + new_params = kmemdup(old_params, sizeof(*new_params), GFP_ATOMIC); + if (!new_params) { + spin_unlock_irqrestore(&model->update_lock, flags); + return; + } + + // Aggregate data from all CPUs and reset per-cpu buckets. 
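+ // Each CPU's counters are folded into ad->aggr_buckets and then zeroed,
+ // so every per-cpu sample contributes to the model statistics exactly once.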
+ for_each_possible_cpu(cpu) { + pcpu_b = per_cpu_ptr(model->pcpu_buckets, cpu); + + for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { + if (pcpu_b->small_bucket[i].sum_of_weights) { + asb = &aggr->small_bucket[i]; + asb->sum_of_weights += + pcpu_b->small_bucket[i].sum_of_weights; + asb->weighted_sum_latency += + pcpu_b->small_bucket[i].weighted_sum_latency; + } + if (pcpu_b->large_bucket[i].sum_of_weights) { + alb = &aggr->large_bucket[i]; + alb->sum_of_weights += + pcpu_b->large_bucket[i].sum_of_weights; + alb->weighted_sum_latency += + pcpu_b->large_bucket[i].weighted_sum_latency; + alb->weighted_sum_block_size += + pcpu_b->large_bucket[i].weighted_sum_block_size; + } + } + // Reset per-cpu buckets after aggregating + reset_buckets(pcpu_b); + } + + // Count the number of entries in aggregated buckets + small_weight = lm_count_small_entries(aggr->small_bucket); + large_weight = lm_count_large_entries(aggr->large_bucket); + + // Whether enough time has elapsed since the last update + now = jiffies; + time_elapsed = unlikely(!new_params->base) || + new_params->last_update_jiffies + + msecs_to_jiffies(LM_INTERVAL_THRESHOLD) <= now; + + // Update small buckets + if (small_weight && (time_elapsed || + LM_SAMPLES_THRESHOLD <= small_weight || !new_params->base)) { + small_processed = lm_update_small_buckets(model, new_params, + aggr->small_bucket, small_weight, !new_params->base); + memset(&aggr->small_bucket[0], 0, sizeof(aggr->small_bucket)); + } + // Update large buckets + if (large_weight && (time_elapsed || + LM_SAMPLES_THRESHOLD <= large_weight || !new_params->slope)) { + large_processed = lm_update_large_buckets(model, new_params, + aggr->large_bucket, large_weight, !new_params->slope); + memset(&aggr->large_bucket[0], 0, sizeof(aggr->large_bucket)); + } + + // Update the base parameter if small bucket was processed + if (small_processed && likely(new_params->small_count)) + new_params->base = div_u64(new_params->small_sum_delay, + new_params->small_count); + + // Update the slope parameter if large bucket was processed + if (large_processed && likely(new_params->large_sum_bsize)) + new_params->slope = div_u64(new_params->large_sum_delay, + DIV_ROUND_UP_ULL(new_params->large_sum_bsize, 1024)); + + // Update last updated jiffies if update happened or time has elapsed + if (small_processed || large_processed || time_elapsed) + new_params->last_update_jiffies = now; + + rcu_assign_pointer(model->params, new_params); + spin_unlock_irqrestore(&model->update_lock, flags); + + kfree_rcu(old_params, rcu); +} + +// Determine the bucket index for a given measured and predicted latency +static u8 lm_input_bucket_index(u64 measured, u64 predicted) { + u8 bucket_index; + + if (measured < predicted * 2) + bucket_index = div_u64((measured * 20), predicted); + else if (measured < predicted * 5) + bucket_index = div_u64((measured * 10), predicted) + 20; + else + bucket_index = div_u64((measured * 3), predicted) + 40; + + return bucket_index; +} + +// Input latency data into the latency model +static void latency_model_input(struct adios_data *ad, + struct latency_model *model, + u32 block_size, u64 latency, u64 pred_lat, u32 weight) { + unsigned long flags; + u8 bucket_index; + struct lm_buckets *buckets; + u64 current_base; + struct latency_model_params *params; + + local_irq_save(flags); + buckets = per_cpu_ptr(model->pcpu_buckets, __smp_processor_id()); + + rcu_read_lock(); + params = rcu_dereference(model->params); + current_base = params->base; + rcu_read_unlock(); + + if (block_size <= 
LM_BLOCK_SIZE_THRESHOLD) { + // Handle small requests + bucket_index = lm_input_bucket_index(latency, current_base ?: 1); + + if (bucket_index >= LM_LAT_BUCKET_COUNT) + bucket_index = LM_LAT_BUCKET_COUNT - 1; + + buckets->small_bucket[bucket_index].sum_of_weights += weight; + buckets->small_bucket[bucket_index].weighted_sum_latency += + latency * weight; + + local_irq_restore(flags); + + if (unlikely(!current_base)) { + latency_model_update(ad, model); + return; + } + } else { + // Handle large requests + if (!current_base || !pred_lat) { + local_irq_restore(flags); + return; + } + + bucket_index = lm_input_bucket_index(latency, pred_lat); + + if (bucket_index >= LM_LAT_BUCKET_COUNT) + bucket_index = LM_LAT_BUCKET_COUNT - 1; + + buckets->large_bucket[bucket_index].sum_of_weights += weight; + buckets->large_bucket[bucket_index].weighted_sum_latency += + latency * weight; + buckets->large_bucket[bucket_index].weighted_sum_block_size += + block_size * weight; + + local_irq_restore(flags); + } +} + +// Predict the latency for a given block size using the latency model +static u64 latency_model_predict(struct latency_model *model, u32 block_size) { + u64 result; + struct latency_model_params *params; + + rcu_read_lock(); + params = rcu_dereference(model->params); + + result = params->base; + if (block_size > LM_BLOCK_SIZE_THRESHOLD) + result += params->slope * + DIV_ROUND_UP_ULL(block_size - LM_BLOCK_SIZE_THRESHOLD, 1024); + + rcu_read_unlock(); + + return result; +} + +// Determine the type of operation based on request flags +static u8 adios_optype(struct request *rq) { + switch (rq->cmd_flags & REQ_OP_MASK) { + case REQ_OP_READ: + return ADIOS_READ; + case REQ_OP_WRITE: + return ADIOS_WRITE; + case REQ_OP_DISCARD: + return ADIOS_DISCARD; + default: + return ADIOS_OTHER; + } +} + +static inline u8 adios_optype_not_read(struct request *rq) { + return (rq->cmd_flags & REQ_OP_MASK) != REQ_OP_READ; +} + +// Helper function to retrieve adios_rq_data from a request +static inline struct adios_rq_data *get_rq_data(struct request *rq) { + return rq->elv.priv[0]; +} + +static inline +void set_adios_state(struct adios_data *ad, u32 shift, u32 idx, bool flag) { + if (flag) + atomic_or(1U << (idx + shift), &ad->state); + else + atomic_andnot(1U << (idx + shift), &ad->state); +} + +static inline u32 get_adios_state(struct adios_data *ad) +{ return atomic_read(&ad->state); } + +static inline u32 eval_this_adios_state(u32 state, u32 shift) +{ return (state >> shift) & 0x3; } + +static inline u32 eval_adios_state(struct adios_data *ad, u32 shift) +{ return eval_this_adios_state(get_adios_state(ad), shift); } + +// Add a request to the deadline-sorted red-black tree +static void add_to_dl_tree( + struct adios_data *ad, bool dl_idx, struct request *rq) { + struct rb_root_cached *root = &ad->dl_tree[dl_idx]; + struct rb_node **link = &(root->rb_root.rb_node), *parent = NULL; + bool leftmost = true; + struct adios_rq_data *rd = get_rq_data(rq); + struct dl_group *dlg; + u64 deadline; + bool was_empty = RB_EMPTY_ROOT(&root->rb_root); + + /* Tier-2: Synchronous Requests + * - Needs to be FIFO within a same optype + * - Relaxed order between different optypes + * - basically needs to be processed in early time */ + rd->deadline = rq->start_time_ns; + + /* Tier-3: Aynchronous Requests + * - Can be reordered and delayed freely */ + if (!(rq->cmd_flags & REQ_SYNC)) { + rd->deadline += ad->latency_target[adios_optype(rq)]; + if (!compliant(ad, ADIOS_CF_FIXORDER)) + rd->deadline += rd->pred_lat; + } + + // Now quantize 
the deadline (-> dlg->deadline == RB-Tree key) + deadline = rd->deadline & ~((1ULL << ADIOS_QUANTUM_SHIFT) - 1); + + while (*link) { + dlg = rb_entry(*link, struct dl_group, node); + s64 diff = deadline - dlg->deadline; + + parent = *link; + if (diff < 0) { + link = &((*link)->rb_left); + } else if (diff > 0) { + link = &((*link)->rb_right); + leftmost = false; + } else { // diff == 0 + goto found; + } + } + + dlg = rb_entry_safe(parent, struct dl_group, node); + if (!dlg || dlg->deadline != deadline) { + dlg = kmem_cache_zalloc(ad->dl_group_pool, GFP_ATOMIC); + if (!dlg) + return; + dlg->deadline = deadline; + INIT_LIST_HEAD(&dlg->rqs); + rb_link_node(&dlg->node, parent, link); + rb_insert_color_cached(&dlg->node, root, leftmost); + } +found: + list_add_tail(&rd->dl_node, &dlg->rqs); + rd->dl_group = &dlg->rqs; + + if (was_empty) + set_adios_state(ad, ADIOS_STATE_DL, dl_idx, true); +} + +// Remove a request from the deadline-sorted red-black tree +static void del_from_dl_tree( + struct adios_data *ad, bool dl_idx, struct request *rq) { + struct rb_root_cached *root = &ad->dl_tree[dl_idx]; + struct adios_rq_data *rd = get_rq_data(rq); + struct dl_group *dlg = container_of(rd->dl_group, struct dl_group, rqs); + + list_del_init(&rd->dl_node); + if (list_empty(&dlg->rqs)) { + rb_erase_cached(&dlg->node, root); + kmem_cache_free(ad->dl_group_pool, dlg); + } + rd->dl_group = NULL; + + if (RB_EMPTY_ROOT(&ad->dl_tree[dl_idx].rb_root)) + set_adios_state(ad, ADIOS_STATE_DL, dl_idx, false); +} + +// Remove a request from the scheduler +static void remove_request(struct adios_data *ad, struct request *rq) { + bool dl_idx = adios_optype_not_read(rq); + struct request_queue *q = rq->q; + struct adios_rq_data *rd = get_rq_data(rq); + + list_del_init(&rq->queuelist); + + // We might not be on the rbtree, if we are doing an insert merge + if (rd->dl_group) + del_from_dl_tree(ad, dl_idx, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; +} + +// Convert a queue depth to the corresponding word depth for shallow allocation +static int to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) { + struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; + const unsigned int nrr = hctx->queue->nr_requests; + + return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; +} + +// We limit the depth of request allocation for asynchronous and write requests +static void adios_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { + struct adios_data *ad = data->q->elevator->elevator_data; + + // Do not throttle synchronous reads + if (op_is_sync(opf) && !op_is_write(opf)) + return; + + data->shallow_depth = to_word_depth(data->hctx, ad->async_depth); +} + +// The number of requests in the queue was notified from the block layer +static void adios_depth_updated(struct request_queue *q) { + struct adios_data *ad = q->elevator->elevator_data; + + ad->async_depth = q->nr_requests; + blk_mq_set_min_shallow_depth(q, 1); +} + +// Handle request merging after a merge operation +static void adios_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) { + bool dl_idx = adios_optype_not_read(req); + struct adios_data *ad = q->elevator->elevator_data; + + // Reposition request in the deadline-sorted tree + del_from_dl_tree(ad, dl_idx, req); + add_to_dl_tree(ad, dl_idx, req); +} + +// Handle merging of requests after one has been merged into another +static void adios_merged_requests(struct request_queue *q, struct request *req, + struct request *next) { + struct 
adios_data *ad = q->elevator->elevator_data; + + lockdep_assert_held(&ad->lock); + + // kill knowledge of next, this one is a goner + remove_request(ad, next); +} + +// Try to merge a bio into an existing rq before associating it with an rq +static bool adios_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) { + unsigned long flags; + struct adios_data *ad = q->elevator->elevator_data; + struct request *free = NULL; + bool ret; + + if (eval_adios_state(ad, ADIOS_STATE_BP)) + return false; + + if (!spin_trylock_irqsave(&ad->lock, flags)) + return false; + + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irqrestore(&ad->lock, flags); + + if (free) + blk_mq_free_request(free); + + return ret; +} + +static bool merge_or_insert_to_dl_tree(struct adios_data *ad, + struct request *rq, struct request_queue *q, struct list_head *free) { + if (blk_mq_sched_try_insert_merge(q, rq, free)) + return true; + + bool dl_idx = adios_optype_not_read(rq); + add_to_dl_tree(ad, dl_idx, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + + return false; +} + +static void insert_to_prio_queue(struct adios_data *ad, + struct request *rq, bool pq_idx) { + struct adios_rq_data *rd = get_rq_data(rq); + + /* We're sure that rd->managed == true */ + union adios_in_flight_rqs ifr = { + .count = 1, + .total_pred_lat = rd->pred_lat, + }; + atomic64_add(ifr.scalar, &ad->in_flight_rqs.atomic); + + scoped_guard(spinlock_irqsave, &ad->pq_lock) { + bool was_empty = list_empty(&ad->prio_queue[pq_idx]); + list_add_tail(&rq->queuelist, &ad->prio_queue[pq_idx]); + if (was_empty) + set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, true); + } +} + +// Insert a request into the scheduler (after Read & Write models stabilized) +static void insert_request_post_stability(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_insert_t insert_flags, struct list_head *free) { + struct request_queue *q = hctx->queue; + struct adios_data *ad = q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + u8 optype = adios_optype(rq); + bool rq_is_flush; + + rd->managed = true; + rd->block_size = blk_rq_bytes(rq); + rd->pred_lat = + latency_model_predict(&ad->latency_model[optype], rd->block_size); + + /* Tier-0: BLK_MQ_INSERT_AT_HEAD Requests */ + if (insert_flags & BLK_MQ_INSERT_AT_HEAD) { + insert_to_prio_queue(ad, rq, 0); + return; + } + + /* + * Strict Barrier Handling for REQ_OP_FLUSH: + * If a flush request arrives, or if the scheduler is already in a + * barrier-pending state, all subsequent requests are diverted to a + * separate barrier_queue. This ensures that no new requests are processed + * until all work preceding the barrier is complete. 
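+ * Once the scheduler drains, release_barrier_requests() moves the flush
+ * itself to prio_queue[1] and re-inserts the requests held behind it
+ * (up to the next flush) into the deadline tree.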
+ */ + rq_is_flush = rq->cmd_flags & REQ_OP_FLUSH; + if (eval_adios_state(ad, ADIOS_STATE_BP) || rq_is_flush) { + scoped_guard(spinlock_irqsave, &ad->barrier_lock) { + if (rq_is_flush) + set_adios_state(ad, ADIOS_STATE_BP, 0, true); + list_add_tail(&rq->queuelist, &ad->barrier_queue); + } + return; + } + + if (merge_or_insert_to_dl_tree(ad, rq, q, free)) + return; +} + +// Insert a request into the scheduler (before Read & Write models stabilizes) +static void insert_request_pre_stability(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_insert_t insert_flags, struct list_head *free) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + u8 optype = adios_optype(rq); + u8 pq_idx = !(insert_flags & BLK_MQ_INSERT_AT_HEAD); + bool models_stable = false; + + rd->managed = true; + rd->block_size = blk_rq_bytes(rq); + rd->pred_lat = + latency_model_predict(&ad->latency_model[optype], rd->block_size); + + insert_to_prio_queue(ad, rq, pq_idx); + + rcu_read_lock(); + if (rcu_dereference(ad->latency_model[ADIOS_READ].params)->base > 0 && + rcu_dereference(ad->latency_model[ADIOS_WRITE].params)->base > 0) + models_stable = true; + rcu_read_unlock(); + + if (models_stable) + ad->insert_request_fn = insert_request_post_stability; +} + +// Insert multiple requests into the scheduler +static void adios_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, + blk_insert_t insert_flags) { + struct request_queue *q = hctx->queue; + struct adios_data *ad = q->elevator->elevator_data; + struct request *rq; + bool stop = false; + LIST_HEAD(free); + + do { + scoped_guard(spinlock_irqsave, &ad->lock) + for (int i = 0; i < ADIOS_MAX_INSERTS_PER_LOCK; i++) { + if (list_empty(list)) { + stop = true; + break; + } + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + ad->insert_request_fn(hctx, rq, insert_flags, &free); + }} while (!stop); + + blk_mq_free_requests(&free); +} + +// Prepare a request before it is inserted into the scheduler +static void adios_prepare_request(struct request *rq) { + struct adios_data *ad = rq->q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + + rq->elv.priv[0] = NULL; + + /* Allocate adios_rq_data from the memory pool */ + rd = kmem_cache_zalloc(ad->rq_data_pool, GFP_ATOMIC); + if (WARN(!rd, "adios_prepare_request: " + "Failed to allocate memory from rq_data_pool. rd is NULL\n")) + return; + + rd->rq = rq; + rq->elv.priv[0] = rd; +} + +static struct adios_rq_data *get_dl_first_rd(struct adios_data *ad, bool idx) { + struct rb_root_cached *root = &ad->dl_tree[idx]; + struct rb_node *first = rb_first_cached(root); + struct dl_group *dl_group = rb_entry(first, struct dl_group, node); + + return list_first_entry(&dl_group->rqs, struct adios_rq_data, dl_node); +} + +// Comparison function for sorting requests by block address +static int cmp_rq_pos(void *priv, + const struct list_head *a, const struct list_head *b) { + struct request *rq_a = list_entry(a, struct request, queuelist); + struct request *rq_b = list_entry(b, struct request, queuelist); + u64 pos_a = blk_rq_pos(rq_a); + u64 pos_b = blk_rq_pos(rq_b); + + return (int)(pos_a > pos_b) - (int)(pos_a < pos_b); +} + +#ifndef list_last_entry_or_null +#define list_last_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? 
list_last_entry(ptr, type, member) : NULL) +#endif + +// Update the elevator direction +static void update_elv_direction(struct adios_data *ad) { + if (!ad->is_rotational) + return; + + bool page = ad->bq_page; + struct list_head *q = &ad->batch_queue[page][1]; + if (ad->bq_batch_order[page] < ADIOS_BO_ELEVATOR || list_empty(q)) { + ad->elv_direction = 0; + return; + } + + // Get first and last request positions in the queue + struct request *rq_a = list_first_entry(q, struct request, queuelist); + struct request *rq_b = list_last_entry (q, struct request, queuelist); + u64 pos_a = blk_rq_pos(rq_a); + u64 pos_b = blk_rq_pos(rq_b); + u64 avg_rq_pos = (pos_a + pos_b) >> 1; + + ad->elv_direction = !!(ad->head_pos > avg_rq_pos); +} + +// Fill the batch queues with requests from the deadline-sorted red-black tree +static bool fill_batch_queues(struct adios_data *ad, u64 tpl) { + struct adios_rq_data *rd; + struct request *rq; + struct list_head *dest_q; + u8 dest_idx; + u64 added_lat = 0; + u32 optype_count[ADIOS_OPTYPES] = {0}; + u32 count = 0; + u8 optype; + bool page = !ad->bq_page, dl_idx, bias_idx, update_bias; + u32 dl_queued; + u8 bq_batch_order; + bool stop = false; + + // Reset batch queue counts for the back page + memset(&ad->batch_count[page], 0, sizeof(ad->batch_count[page])); + + ad->bq_batch_order[page] = + bq_batch_order = ad->batch_order; + + do { + scoped_guard(spinlock_irqsave, &ad->lock) + for (int i = 0; i < ADIOS_MAX_DELETES_PER_LOCK; i++) { + bool has_base = false; + + dl_queued = eval_adios_state(ad, ADIOS_STATE_DL); + // Check if there are any requests queued in the deadline tree + if (!dl_queued) { + stop = true; + break; + } + + // Reads if both queues have requests, otherwise pick the non-empty. + dl_idx = dl_queued >> 1; + + // Get the first request from the deadline-sorted tree + rd = get_dl_first_rd(ad, dl_idx); + + bias_idx = ad->dl_bias < 0; + // If read and write requests are queued, choose one based on bias + if (dl_queued == 0x3) { + struct adios_rq_data *trd[2] = {get_dl_first_rd(ad, 0), rd}; + rd = trd[bias_idx]; + + update_bias = (trd[bias_idx]->deadline > trd[!bias_idx]->deadline); + } else + update_bias = (bias_idx == dl_idx); + + rq = rd->rq; + optype = adios_optype(rq); + + rcu_read_lock(); + has_base = + !!rcu_dereference(ad->latency_model[optype].params)->base; + rcu_read_unlock(); + + // Check batch size and total predicted latency + if (count && (!has_base || + ad->batch_count[page][optype] >= ad->batch_limit[optype] || + (tpl + added_lat + rd->pred_lat) > ad->global_latency_window)) { + stop = true; + break; + } + + if (update_bias) { + s64 sign = ((s64)bias_idx << 1) - 1; + if (unlikely(!rd->pred_lat)) + ad->dl_bias = sign; + else + // Adjust the bias based on the predicted latency + ad->dl_bias += sign * (s64)((rd->pred_lat * + adios_prio_to_wmult[ad->dl_prio[bias_idx] + 20]) >> 10); + } + + remove_request(ad, rq); + + // Add request to the corresponding batch queue + dest_idx = (bq_batch_order == ADIOS_BO_OPTYPE || optype == ADIOS_OTHER)? 
+ optype : !!(rd->deadline != rq->start_time_ns); + dest_q = &ad->batch_queue[page][dest_idx]; + list_add_tail(&rq->queuelist, dest_q); + ad->bq_state[page] |= 1U << dest_idx; + ad->batch_count[page][optype]++; + optype_count[optype]++; + added_lat += rd->pred_lat; + count++; + }} while (!stop); + + if (bq_batch_order == ADIOS_BO_ELEVATOR && ad->batch_count[page][1] > 1) + list_sort(NULL, &ad->batch_queue[page][1], cmp_rq_pos); + + if (count) { + /* We're sure that every request's rd->managed == true */ + union adios_in_flight_rqs ifr = { + .count = count, + .total_pred_lat = added_lat, + }; + atomic64_add(ifr.scalar, &ad->in_flight_rqs.atomic); + + set_adios_state(ad, ADIOS_STATE_BQ, page, true); + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) + if (ad->batch_actual_max_size[optype] < optype_count[optype]) + ad->batch_actual_max_size[optype] = optype_count[optype]; + if (ad->batch_actual_max_total < count) + ad->batch_actual_max_total = count; + } + return count; +} + +// Flip to the next batch queue page +static void flip_bq_page(struct adios_data *ad) { + ad->bq_page = !ad->bq_page; + update_elv_direction(ad); +} + +// Pop a request from the specified index (optype or elevator tier) +static inline struct request *pop_bq_request( + struct adios_data *ad, u8 idx, bool direction) { + bool page = ad->bq_page; + struct list_head *q = &ad->batch_queue[page][idx]; + struct request *rq = direction ? + list_last_entry_or_null (q, struct request, queuelist): + list_first_entry_or_null(q, struct request, queuelist); + if (rq) { + list_del_init(&rq->queuelist); + if (list_empty(q)) + ad->bq_state[page] &= ~(1U << idx); + } + return rq; +} + +static struct request *pop_next_bq_request_optype(struct adios_data *ad) { + u32 bq_state = ad->bq_state[ad->bq_page]; + if (!bq_state) return NULL; + + struct request *rq; + u32 bq_idx = __builtin_ctz(bq_state); + + // Dispatch based on optype (FIFO within each) or single-queue elevator + rq = pop_bq_request(ad, bq_idx, false); + return rq; +} + +static struct request *pop_next_bq_request_elevator(struct adios_data *ad) { + u32 bq_state = ad->bq_state[ad->bq_page]; + if (!bq_state) return NULL; + + struct request *rq; + u32 bq_idx = __builtin_ctz(bq_state); + bool direction = (bq_idx == 1) & ad->elv_direction; + + // Tier-2 (sync) is always high priority + // Tier-3 (async) uses the pre-calculated elevator direction + rq = pop_bq_request(ad, bq_idx, direction); + + /* If batch queue for the sync requests just became empty */ + if (bq_idx == 0 && rq && !(bq_state & 0x1)) + update_elv_direction(ad); + + return rq; +} + +// Returns the state of the batch queue page +static inline bool bq_page_has_rq(u32 bq_state, bool page) { + return bq_state & (1U << page); +} + +// Dispatch a request from the batch queues +static struct request *dispatch_from_bq(struct adios_data *ad) { + struct request *rq; + + guard(spinlock_irqsave)(&ad->bq_lock); + + u32 state = get_adios_state(ad); + u32 bq_state = eval_this_adios_state(state, ADIOS_STATE_BQ); + u32 bq_curr_page_has_rq = bq_page_has_rq(bq_state, ad->bq_page); + union adios_in_flight_rqs ifr; + ifr.scalar = atomic64_read(&ad->in_flight_rqs.atomic); + u64 tpl = ifr.total_pred_lat; + + // Refill the batch queues if the back page is empty, dl_tree has work, and + // current page is empty or the total ongoing latency is below the threshold + if (!bq_page_has_rq(bq_state, !ad->bq_page) && + (!bq_curr_page_has_rq || (!tpl || tpl < div_u64( + ad->global_latency_window * ad->bq_refill_below_ratio, 100))) && + 
eval_this_adios_state(state, ADIOS_STATE_DL)) + fill_batch_queues(ad, tpl); + + // If current batch queue page is empty, and the other page has work, flip + if (!bq_curr_page_has_rq && + bq_page_has_rq(eval_adios_state(ad, ADIOS_STATE_BQ), !ad->bq_page)) + flip_bq_page(ad); + + // Use the per-page state to decide the dispatch logic, ensuring correctness + rq = (ad->bq_batch_order[ad->bq_page] == ADIOS_BO_ELEVATOR) ? + pop_next_bq_request_elevator(ad): + pop_next_bq_request_optype(ad); + + if (rq) { + bool page = ad->bq_page; + bool is_empty = !ad->bq_state[page]; + if (is_empty) + set_adios_state(ad, ADIOS_STATE_BQ, page, false); + return rq; + } + + return NULL; +} + +// Dispatch a request from the priority queue +static struct request *dispatch_from_pq(struct adios_data *ad) { + struct request *rq = NULL; + + guard(spinlock_irqsave)(&ad->pq_lock); + u32 pq_state = eval_adios_state(ad, ADIOS_STATE_PQ); + u8 pq_idx = pq_state >> 1; + struct list_head *q = &ad->prio_queue[pq_idx]; + + if (unlikely(list_empty(q))) return NULL; + + rq = list_first_entry(q, struct request, queuelist); + list_del_init(&rq->queuelist); + if (list_empty(q)) { + set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, false); + update_elv_direction(ad); + } + return rq; +} + +static bool release_barrier_requests(struct adios_data *ad) { + u32 moved_count = 0; + LIST_HEAD(local_list); + + scoped_guard(spinlock_irqsave, &ad->barrier_lock) { + if (!list_empty(&ad->barrier_queue)) { + struct request *trq, *next; + bool first_barrier_moved = false; + + list_for_each_entry_safe(trq, next, &ad->barrier_queue, queuelist) { + if (!first_barrier_moved) { + list_del_init(&trq->queuelist); + insert_to_prio_queue(ad, trq, 1); + moved_count++; + first_barrier_moved = true; + continue; + } + + if (trq->cmd_flags & REQ_OP_FLUSH) + break; + + list_move_tail(&trq->queuelist, &local_list); + moved_count++; + } + + if (list_empty(&ad->barrier_queue)) + set_adios_state(ad, ADIOS_STATE_BP, 0, false); + } + } + + if (!moved_count) + return false; + + if (!list_empty(&local_list)) { + struct request *trq, *next; + LIST_HEAD(free_list); + + /* ad->lock is already held */ + list_for_each_entry_safe(trq, next, &local_list, queuelist) { + list_del_init(&trq->queuelist); + if (merge_or_insert_to_dl_tree(ad, trq, ad->queue, &free_list)) + continue; + } + + if (!list_empty(&free_list)) + blk_mq_free_requests(&free_list); + } + + return true; +} + +// Dispatch a request to the hardware queue +static struct request *adios_dispatch_request(struct blk_mq_hw_ctx *hctx) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + struct request *rq; + +retry: + rq = dispatch_from_pq(ad); + if (rq) + goto found; + + rq = dispatch_from_bq(ad); + if (rq) + goto found; + + /* + * If all active queues are empty, check if we need to process a barrier. + * This is the trigger to release requests that were held in barrier_queue + * due to a REQ_OP_FLUSH barrier. 
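+ * The release is attempted only once in_flight_rqs.count has dropped to
+ * zero, i.e. nothing previously dispatched is still outstanding.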
+ */ + if (eval_adios_state(ad, ADIOS_STATE_BP)) { + union adios_in_flight_rqs ifr; + ifr.scalar = atomic64_read(&ad->in_flight_rqs.atomic); + if (!ifr.count) { + bool barrier_released = false; + scoped_guard(spinlock_irqsave, &ad->lock) + barrier_released = release_barrier_requests(ad); + if (barrier_released) + goto retry; + } + } + + return NULL; +found: + if (ad->is_rotational) + ad->head_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + rq->rq_flags |= RQF_STARTED; + return rq; +} + +// Timer callback function to periodically update latency models +static void update_timer_callback(struct timer_list *t) { + struct adios_data *ad = timer_container_of(ad, t, update_timer); + + for (u8 optype = 0; optype < ADIOS_OPTYPES; optype++) + latency_model_update(ad, &ad->latency_model[optype]); +} + +// Handle the completion of a request +static void adios_completed_request(struct request *rq, u64 now) { + struct adios_data *ad = rq->q->elevator->elevator_data; + struct adios_rq_data *rd = get_rq_data(rq); + union adios_in_flight_rqs ifr = { .scalar = 0 }; + + if (rd->managed) { + union adios_in_flight_rqs ifr_to_sub = { + .count = 1, + .total_pred_lat = rd->pred_lat, + }; + ifr.scalar = atomic64_sub_return( + ifr_to_sub.scalar, &ad->in_flight_rqs.atomic); + } + u8 optype = adios_optype(rq); + + if (optype == ADIOS_OTHER) { + // Non-positional commands make the head position unpredictable. + // Invalidate our knowledge of the last completed position. + if (ad->is_rotational) + ad->last_completed_pos = 0; + return; + } + + u64 lct = ad->last_completed_time ?: rq->io_start_time_ns; + ad->last_completed_time = (ifr.count) ? now : 0; + + if (!rq->io_start_time_ns || !rd->block_size || unlikely(now < lct)) + return; + + u64 latency = now - lct; + if (latency > ad->lat_model_latency_limit) + return; + + u32 weight = 1; + if (ad->is_rotational) { + sector_t current_pos = blk_rq_pos(rq); + // Only calculate seek distance if we have a valid last position. + if (ad->last_completed_pos > 0) { + u64 seek_distance = abs( + (s64)current_pos - (s64)ad->last_completed_pos); + weight = 65 - __builtin_clzll(seek_distance); + } + // Update (or re-synchronize) our knowledge of the head position. 
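+ // last_completed_pos ends up just past the final sector of this request,
+ // i.e. where the head is expected to sit after completion.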
+ ad->last_completed_pos = current_pos + blk_rq_sectors(rq); + } + + latency_model_input(ad, &ad->latency_model[optype], + rd->block_size, latency, rd->pred_lat, weight); + timer_reduce(&ad->update_timer, jiffies + msecs_to_jiffies(100)); +} + +// Clean up after a request is finished +static void adios_finish_request(struct request *rq) { + struct adios_data *ad = rq->q->elevator->elevator_data; + + if (rq->elv.priv[0]) { + // Free adios_rq_data back to the memory pool + kmem_cache_free(ad->rq_data_pool, get_rq_data(rq)); + rq->elv.priv[0] = NULL; + } +} + +// Check if there are any requests available for dispatch +static bool adios_has_work(struct blk_mq_hw_ctx *hctx) { + struct adios_data *ad = hctx->queue->elevator->elevator_data; + + return atomic_read(&ad->state) != 0; +} + +// Initialize the scheduler-specific data when initializing the request queue +static int adios_init_sched(struct request_queue *q, struct elevator_queue *eq) { + struct adios_data *ad; + int ret = -ENOMEM; + u8 optype = 0; + + ad = kzalloc_node(sizeof(*ad), GFP_KERNEL, q->node); + if (!ad) { + pr_err("adios: Failed to create adios_data\n"); + goto put_eq; + } + + eq->elevator_data = ad; + + // Create a memory pool for adios_rq_data + ad->rq_data_pool = kmem_cache_create("rq_data_pool", + sizeof(struct adios_rq_data), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ad->rq_data_pool) { + pr_err("adios: Failed to create rq_data_pool\n"); + goto free_ad; + } + + /* Create a memory pool for dl_group */ + ad->dl_group_pool = kmem_cache_create("dl_group_pool", + sizeof(struct dl_group), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ad->dl_group_pool) { + pr_err("adios: Failed to create dl_group_pool\n"); + goto destroy_rq_data_pool; + } + + for (int i = 0; i < ADIOS_PQ_LEVELS; i++) + INIT_LIST_HEAD(&ad->prio_queue[i]); + + for (u8 i = 0; i < ADIOS_DL_TYPES; i++) { + ad->dl_tree[i] = RB_ROOT_CACHED; + ad->dl_prio[i] = default_dl_prio[i]; + } + ad->dl_bias = 0; + + for (u8 page = 0; page < ADIOS_BQ_PAGES; page++) + for (optype = 0; optype < ADIOS_OPTYPES; optype++) + INIT_LIST_HEAD(&ad->batch_queue[page][optype]); + + ad->aggr_buckets = kzalloc(sizeof(*ad->aggr_buckets), GFP_KERNEL); + if (!ad->aggr_buckets) { + pr_err("adios: Failed to allocate aggregation buckets\n"); + goto destroy_dl_group_pool; + } + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) { + struct latency_model *model = &ad->latency_model[optype]; + struct latency_model_params *params; + + spin_lock_init(&model->update_lock); + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (!params) { + pr_err("adios: Failed to allocate latency_model_params\n"); + goto free_buckets; + } + params->last_update_jiffies = jiffies; + RCU_INIT_POINTER(model->params, params); + + model->pcpu_buckets = alloc_percpu(struct lm_buckets); + if (!model->pcpu_buckets) { + pr_err("adios: Failed to allocate per-CPU buckets\n"); + kfree(params); + goto free_buckets; + } + + model->lm_shrink_at_kreqs = default_lm_shrink_at_kreqs; + model->lm_shrink_at_gbytes = default_lm_shrink_at_gbytes; + model->lm_shrink_resist = default_lm_shrink_resist; + } + + for (optype = 0; optype < ADIOS_OPTYPES; optype++) { + ad->latency_target[optype] = default_latency_target[optype]; + ad->batch_limit[optype] = default_batch_limit[optype]; + } + + eq->elevator_data = ad; + + ad->is_rotational = !!(q->limits.features & BLK_FEAT_ROTATIONAL); + ad->global_latency_window = (ad->is_rotational)? 
+ default_global_latency_window_rotational: + default_global_latency_window; + ad->bq_refill_below_ratio = default_bq_refill_below_ratio; + ad->lat_model_latency_limit = default_lat_model_latency_limit; + ad->batch_order = default_batch_order; + ad->compliance_flags = default_compliance_flags; + + ad->insert_request_fn = insert_request_pre_stability; + + atomic_set(&ad->state, 0); + + spin_lock_init(&ad->lock); + spin_lock_init(&ad->pq_lock); + spin_lock_init(&ad->bq_lock); + spin_lock_init(&ad->barrier_lock); + INIT_LIST_HEAD(&ad->barrier_queue); + + timer_setup(&ad->update_timer, update_timer_callback, 0); + + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + + ad->queue = q; + blk_stat_enable_accounting(q); + + q->elevator = eq; + adios_depth_updated(q); + return 0; + +free_buckets: + pr_err("adios: Failed to allocate per-cpu buckets\n"); + while (optype-- > 0) { + struct latency_model *prev_model = &ad->latency_model[optype]; + kfree(rcu_access_pointer(prev_model->params)); + free_percpu(prev_model->pcpu_buckets); + } + kfree(ad->aggr_buckets); +destroy_dl_group_pool: + kmem_cache_destroy(ad->dl_group_pool); +destroy_rq_data_pool: + kmem_cache_destroy(ad->rq_data_pool); +free_ad: + kfree(ad); +put_eq: + kobject_put(&eq->kobj); + return ret; +} + +// Clean up and free resources when exiting the scheduler +static void adios_exit_sched(struct elevator_queue *e) { + struct adios_data *ad = e->elevator_data; + + timer_shutdown_sync(&ad->update_timer); + + WARN_ON_ONCE(!list_empty(&ad->barrier_queue)); + for (int i = 0; i < 2; i++) + WARN_ON_ONCE(!list_empty(&ad->prio_queue[i])); + + for (u8 i = 0; i < ADIOS_OPTYPES; i++) { + struct latency_model *model = &ad->latency_model[i]; + struct latency_model_params *params = rcu_access_pointer(model->params); + + RCU_INIT_POINTER(model->params, NULL); + kfree_rcu(params, rcu); + + free_percpu(model->pcpu_buckets); + } + + synchronize_rcu(); + + kfree(ad->aggr_buckets); + + if (ad->rq_data_pool) + kmem_cache_destroy(ad->rq_data_pool); + + if (ad->dl_group_pool) + kmem_cache_destroy(ad->dl_group_pool); + + blk_stat_disable_accounting(ad->queue); + + kfree(ad); +} + +static void sideload_latency_model( + struct latency_model *model, u64 base, u64 slope) { + struct latency_model_params *old_params, *new_params; + unsigned long flags; + + new_params = kzalloc(sizeof(*new_params), GFP_KERNEL); + if (!new_params) + return; + + spin_lock_irqsave(&model->update_lock, flags); + + old_params = rcu_dereference_protected(model->params, + lockdep_is_held(&model->update_lock)); + + new_params->last_update_jiffies = jiffies; + + // Initialize base and its statistics as a single sample. + new_params->base = base; + new_params->small_sum_delay = base; + new_params->small_count = 1; + + // Initialize slope and its statistics as a single sample. 
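+ // Seeding large_sum_bsize with 1024 (1 KiB) makes the recomputed slope
+ // (large_sum_delay / DIV_ROUND_UP(large_sum_bsize, 1024)) equal the value
+ // supplied here, in ns per KiB; latency_model_predict() then returns
+ // base + slope * DIV_ROUND_UP(block_size - 4096, 1024) for large requests.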
+ new_params->slope = slope; + new_params->large_sum_delay = slope; + new_params->large_sum_bsize = 1024; /* Corresponds to 1 KiB */ + + lm_reset_pcpu_buckets(model); + + rcu_assign_pointer(model->params, new_params); + spin_unlock_irqrestore(&model->update_lock, flags); + + kfree_rcu(old_params, rcu); +} + +// Define sysfs attributes for operation types +#define SYSFS_OPTYPE_DECL(name, optype) \ +static ssize_t adios_lat_model_##name##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + struct latency_model *model = &ad->latency_model[optype]; \ + struct latency_model_params *params; \ + ssize_t len = 0; \ + u64 base, slope; \ + rcu_read_lock(); \ + params = rcu_dereference(model->params); \ + base = params->base; \ + slope = params->slope; \ + rcu_read_unlock(); \ + len += sprintf(page, "base : %llu ns\n", base); \ + len += sprintf(page + len, "slope: %llu ns/KiB\n", slope); \ + return len; \ +} \ +static ssize_t adios_lat_model_##name##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + struct latency_model *model = &ad->latency_model[optype]; \ + u64 base, slope; \ + int ret; \ + ret = sscanf(page, "%llu %llu", &base, &slope); \ + if (ret != 2) \ + return -EINVAL; \ + sideload_latency_model(model, base, slope); \ + reset_buckets(ad->aggr_buckets); \ + return count; \ +} \ +static ssize_t adios_lat_target_##name##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + return sprintf(page, "%llu\n", ad->latency_target[optype]); \ +} \ +static ssize_t adios_lat_target_##name##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + unsigned long nsec; \ + int ret; \ + ret = kstrtoul(page, 10, &nsec); \ + if (ret) \ + return ret; \ + sideload_latency_model(&ad->latency_model[optype], 0, 0); \ + ad->latency_target[optype] = nsec; \ + return count; \ +} \ +static ssize_t adios_batch_limit_##name##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + return sprintf(page, "%u\n", ad->batch_limit[optype]); \ +} \ +static ssize_t adios_batch_limit_##name##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + unsigned long max_batch; \ + int ret; \ + ret = kstrtoul(page, 10, &max_batch); \ + if (ret || max_batch == 0) \ + return -EINVAL; \ + struct adios_data *ad = e->elevator_data; \ + ad->batch_limit[optype] = max_batch; \ + return count; \ +} + +SYSFS_OPTYPE_DECL(read, ADIOS_READ); +SYSFS_OPTYPE_DECL(write, ADIOS_WRITE); +SYSFS_OPTYPE_DECL(discard, ADIOS_DISCARD); + +// Show the maximum batch size actually achieved for each operation type +static ssize_t adios_batch_actual_max_show( + struct elevator_queue *e, char *page) { + struct adios_data *ad = e->elevator_data; + u32 total_count, read_count, write_count, discard_count; + + total_count = ad->batch_actual_max_total; + read_count = ad->batch_actual_max_size[ADIOS_READ]; + write_count = ad->batch_actual_max_size[ADIOS_WRITE]; + discard_count = ad->batch_actual_max_size[ADIOS_DISCARD]; + + return sprintf(page, + "Total : %u\nDiscard: %u\nRead : %u\nWrite : %u\n", + total_count, discard_count, read_count, write_count); +} + +#define SYSFS_ULL_DECL(field, min_val, max_val) \ +static ssize_t adios_##field##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + return sprintf(page, "%llu\n", ad->field); \ +} \ 
+static ssize_t adios_##field##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + unsigned long val; \ + int ret; \ + ret = kstrtoul(page, 10, &val); \ + if (ret || val < (min_val) || val > (max_val)) \ + return -EINVAL; \ + ad->field = val; \ + return count; \ +} + +SYSFS_ULL_DECL(global_latency_window, 0, ULLONG_MAX) +SYSFS_ULL_DECL(compliance_flags, 0, ULLONG_MAX) + +#define SYSFS_INT_DECL(field, min_val, max_val) \ +static ssize_t adios_##field##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + return sprintf(page, "%d\n", ad->field); \ +} \ +static ssize_t adios_##field##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + int val; \ + int ret; \ + ret = kstrtoint(page, 10, &val); \ + if (ret || val < (min_val) || val > (max_val)) \ + return -EINVAL; \ + ad->field = val; \ + return count; \ +} + +SYSFS_INT_DECL(bq_refill_below_ratio, 0, 100) +SYSFS_INT_DECL(lat_model_latency_limit, 0, 2*NSEC_PER_SEC) +SYSFS_INT_DECL(batch_order, ADIOS_BO_OPTYPE, !!ad->is_rotational) + +// Show the read priority +static ssize_t adios_read_priority_show( + struct elevator_queue *e, char *page) { + struct adios_data *ad = e->elevator_data; + return sprintf(page, "%d\n", ad->dl_prio[0]); +} + +// Set the read priority +static ssize_t adios_read_priority_store( + struct elevator_queue *e, const char *page, size_t count) { + struct adios_data *ad = e->elevator_data; + int prio; + int ret; + + ret = kstrtoint(page, 10, &prio); + if (ret || prio < -20 || prio > 19) + return -EINVAL; + + guard(spinlock_irqsave)(&ad->lock); + ad->dl_prio[0] = prio; + ad->dl_bias = 0; + + return count; +} + +// Reset batch queue statistics +static ssize_t adios_reset_bq_stats_store( + struct elevator_queue *e, const char *page, size_t count) { + struct adios_data *ad = e->elevator_data; + unsigned long val; + int ret; + + ret = kstrtoul(page, 10, &val); + if (ret || val != 1) + return -EINVAL; + + for (u8 i = 0; i < ADIOS_OPTYPES; i++) + ad->batch_actual_max_size[i] = 0; + + ad->batch_actual_max_total = 0; + + return count; +} + +// Reset the latency model parameters or load them from user input +static ssize_t adios_reset_lat_model_store( + struct elevator_queue *e, const char *page, size_t count) +{ + struct adios_data *ad = e->elevator_data; + struct latency_model *model; + int ret; + + /* + * Differentiate between two modes based on input format: + * 1. "1": Fully reset the model (backward compatibility). + * 2. "R_base R_slope W_base W_slope D_base D_slope": Load values. + */ + if (!strchr(page, ' ')) { + // Mode 1: Full reset. + unsigned long val; + + ret = kstrtoul(page, 10, &val); + if (ret || val != 1) + return -EINVAL; + + for (u8 i = 0; i < ADIOS_OPTYPES; i++) { + model = &ad->latency_model[i]; + sideload_latency_model(model, 0, 0); + } + } else { + // Mode 2: Load initial values for all latency models. 
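+ // Illustrative usage only (the six numbers below are made up, not
+ // recommended defaults): writing
+ //   "120000 512 180000 768 0 0"
+ // to reset_lat_model sideloads a base of 120000 ns and a slope of
+ // 512 ns/KiB for reads, 180000 ns / 768 ns/KiB for writes, and
+ // zeroes (a plain reset) for discards, in the R/W/D order parsed
+ // by the sscanf() below.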
+ u64 params[3][2]; /* 0:base, 1:slope for R, W, D */ + + ret = sscanf(page, "%llu %llu %llu %llu %llu %llu", + &params[ADIOS_READ ][0], &params[ADIOS_READ ][1], + &params[ADIOS_WRITE ][0], &params[ADIOS_WRITE ][1], + &params[ADIOS_DISCARD][0], &params[ADIOS_DISCARD][1]); + + if (ret != 6) + return -EINVAL; + + for (u8 i = ADIOS_READ; i <= ADIOS_DISCARD; i++) { + model = &ad->latency_model[i]; + sideload_latency_model(model, params[i][0], params[i][1]); + } + } + reset_buckets(ad->aggr_buckets); + + return count; +} + +// Show the ADIOS version +static ssize_t adios_version_show(struct elevator_queue *e, char *page) { + return sprintf(page, "%s\n", ADIOS_VERSION); +} + +// Define sysfs attributes for dynamic thresholds +#define SHRINK_THRESHOLD_ATTR_RW(name, model_field, min_value, max_value) \ +static ssize_t adios_shrink_##name##_store( \ + struct elevator_queue *e, const char *page, size_t count) { \ + struct adios_data *ad = e->elevator_data; \ + unsigned long val; \ + int ret; \ + ret = kstrtoul(page, 10, &val); \ + if (ret || val < min_value || val > max_value) \ + return -EINVAL; \ + for (u8 i = 0; i < ADIOS_OPTYPES; i++) { \ + struct latency_model *model = &ad->latency_model[i]; \ + unsigned long flags; \ + spin_lock_irqsave(&model->update_lock, flags); \ + model->model_field = val; \ + spin_unlock_irqrestore(&model->update_lock, flags); \ + } \ + return count; \ +} \ +static ssize_t adios_shrink_##name##_show( \ + struct elevator_queue *e, char *page) { \ + struct adios_data *ad = e->elevator_data; \ + u32 val = 0; \ + unsigned long flags; \ + struct latency_model *model = &ad->latency_model[0]; \ + spin_lock_irqsave(&model->update_lock, flags); \ + val = model->model_field; \ + spin_unlock_irqrestore(&model->update_lock, flags); \ + return sprintf(page, "%u\n", val); \ +} + +SHRINK_THRESHOLD_ATTR_RW(at_kreqs, lm_shrink_at_kreqs, 1, 100000) +SHRINK_THRESHOLD_ATTR_RW(at_gbytes, lm_shrink_at_gbytes, 1, 1000) +SHRINK_THRESHOLD_ATTR_RW(resist, lm_shrink_resist, 1, 3) + +// Define sysfs attributes +#define AD_ATTR(name, show_func, store_func) \ + __ATTR(name, 0644, show_func, store_func) +#define AD_ATTR_RW(name) \ + __ATTR(name, 0644, adios_##name##_show, adios_##name##_store) +#define AD_ATTR_RO(name) \ + __ATTR(name, 0444, adios_##name##_show, NULL) +#define AD_ATTR_WO(name) \ + __ATTR(name, 0200, NULL, adios_##name##_store) + +// Define sysfs attributes for ADIOS scheduler +static struct elv_fs_entry adios_sched_attrs[] = { + AD_ATTR_RO(batch_actual_max), + AD_ATTR_RW(bq_refill_below_ratio), + AD_ATTR_RW(global_latency_window), + AD_ATTR_RW(lat_model_latency_limit), + AD_ATTR_RW(batch_order), + AD_ATTR_RW(compliance_flags), + + AD_ATTR_RW(batch_limit_read), + AD_ATTR_RW(batch_limit_write), + AD_ATTR_RW(batch_limit_discard), + + AD_ATTR_RW(lat_model_read), + AD_ATTR_RW(lat_model_write), + AD_ATTR_RW(lat_model_discard), + + AD_ATTR_RW(lat_target_read), + AD_ATTR_RW(lat_target_write), + AD_ATTR_RW(lat_target_discard), + + AD_ATTR_RW(shrink_at_kreqs), + AD_ATTR_RW(shrink_at_gbytes), + AD_ATTR_RW(shrink_resist), + + AD_ATTR_RW(read_priority), + + AD_ATTR_WO(reset_bq_stats), + AD_ATTR_WO(reset_lat_model), + AD_ATTR(adios_version, adios_version_show, NULL), + + __ATTR_NULL +}; + +// Define the ADIOS scheduler type +static struct elevator_type mq_adios = { + .ops = { + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .limit_depth = adios_limit_depth, + .depth_updated = adios_depth_updated, + .request_merged = adios_request_merged, + .requests_merged = adios_merged_requests, + 
.bio_merge = adios_bio_merge, + .insert_requests = adios_insert_requests, + .prepare_request = adios_prepare_request, + .dispatch_request = adios_dispatch_request, + .completed_request = adios_completed_request, + .finish_request = adios_finish_request, + .has_work = adios_has_work, + .init_sched = adios_init_sched, + .exit_sched = adios_exit_sched, + }, + .elevator_attrs = adios_sched_attrs, + .elevator_name = "adios", + .elevator_owner = THIS_MODULE, +}; +MODULE_ALIAS("mq-adios-iosched"); + +#define ADIOS_PROGNAME "Adaptive Deadline I/O Scheduler" +#define ADIOS_AUTHOR "Masahito Suzuki" + +// Initialize the ADIOS scheduler module +static int __init adios_init(void) { + printk(KERN_INFO "%s %s by %s\n", + ADIOS_PROGNAME, ADIOS_VERSION, ADIOS_AUTHOR); + return elv_register(&mq_adios); +} + +// Exit the ADIOS scheduler module +static void __exit adios_exit(void) { + elv_unregister(&mq_adios); +} + +module_init(adios_init); +module_exit(adios_exit); + +MODULE_AUTHOR(ADIOS_AUTHOR); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(ADIOS_PROGNAME); \ No newline at end of file diff --git a/block/elevator.c b/block/elevator.c index e2ebfbf107b3af..49ede90c131d3b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -741,7 +741,13 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, void elevator_set_default(struct request_queue *q) { struct elv_change_ctx ctx = { +#ifdef CONFIG_MQ_IOSCHED_DEFAULT_ADIOS + .name = "adios", +#elif defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) + .name = "bfq", +#else .name = "mq-deadline", +#endif .no_uevent = true, }; int err; @@ -758,17 +764,25 @@ void elevator_set_default(struct request_queue *q) * have multiple queues or mq-deadline is not available, default * to "none". */ +#ifndef CONFIG_MQ_IOSCHED_DEFAULT_ADIOS + bool is_sq = q->nr_hw_queues == 1 || blk_mq_is_shared_tags(q->tag_set->flags); + if (!is_sq) +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER) + ctx.name = "kyber"; +#else + return; +#endif +#endif // CONFIG_MQ_IOSCHED_DEFAULT_ADIOS + e = elevator_find_get(ctx.name); if (!e) return; - if ((q->nr_hw_queues == 1 || - blk_mq_is_shared_tags(q->tag_set->flags))) { - err = elevator_change(q, &ctx); - if (err < 0) - pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", - ctx.name, err); - } + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", + ctx.name, err); + elevator_put(e); } diff --git a/drivers/Makefile b/drivers/Makefile index 8e1ffa4358d5f1..2a9eec99c1f7c3 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -64,14 +64,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ -# gpu/ comes after char for AGP vs DRM startup and after iommu -obj-y += gpu/ - obj-$(CONFIG_CONNECTOR) += connector/ -# i810fb depends on char/agp/ -obj-$(CONFIG_FB_I810) += video/fbdev/i810/ - obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ @@ -83,6 +77,13 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ + +# gpu/ comes after char for AGP vs DRM startup and after iommu +obj-y += gpu/ + +# i810fb depends on char/agp/ +obj-$(CONFIG_FB_I810) += video/fbdev/i810/ + obj-$(CONFIG_TARGET_CORE) += target/ obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 7a7f88b3fa2b18..cb26ab099da2bd 100644 --- a/drivers/ata/ahci.c +++ 
b/drivers/ata/ahci.c @@ -1672,7 +1672,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) } #endif -static void ahci_remap_check(struct pci_dev *pdev, int bar, +static int ahci_remap_check(struct pci_dev *pdev, int bar, struct ahci_host_priv *hpriv) { int i; @@ -1685,7 +1685,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, pci_resource_len(pdev, bar) < SZ_512K || bar != AHCI_PCI_BAR_STANDARD || !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) - return; + return 0; cap = readq(hpriv->mmio + AHCI_REMAP_CAP); for (i = 0; i < AHCI_MAX_REMAP; i++) { @@ -1700,18 +1700,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, } if (!hpriv->remapped_nvme) - return; - - dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n", - hpriv->remapped_nvme); - dev_warn(&pdev->dev, - "Switch your BIOS from RAID to AHCI mode to use them.\n"); + return 0; - /* - * Don't rely on the msi-x capability in the remap case, - * share the legacy interrupt across ahci and remapped devices. - */ - hpriv->flags |= AHCI_HFLAG_NO_MSI; + /* Abort probe, allowing intel-nvme-remap to step in when available */ + dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n"); + return -ENODEV; } static int ahci_get_irq_vector(struct ata_host *host, int port) @@ -1975,7 +1968,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return -ENOMEM; /* detect remapped nvme devices */ - ahci_remap_check(pdev, ahci_pci_bar, hpriv); + rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv); + if (rc) + return rc; sysfs_add_file_to_group(&pdev->dev.kobj, &dev_attr_remapped_nvme.attr, diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index c79abdfcd7a9b0..59acfa77934b9c 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -34,7 +34,7 @@ #include "libata.h" static int ahci_skip_host_reset; -int ahci_ignore_sss; +int ahci_ignore_sss = 1; EXPORT_SYMBOL_GPL(ahci_ignore_sss); module_param_named(skip_host_reset, ahci_skip_host_reset, int, 0444); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index a6ecc203f7b7f3..46ea23cbb75437 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -18,10 +18,16 @@ #include "cpufreq_ondemand.h" /* On-demand governor macros */ +#if defined(CONFIG_ZEN_INTERACTIVE) +#define DEF_FREQUENCY_UP_THRESHOLD (55) +#define MICRO_FREQUENCY_UP_THRESHOLD (60) +#define DEF_SAMPLING_DOWN_FACTOR (5) +#else #define DEF_FREQUENCY_UP_THRESHOLD (80) +#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define DEF_SAMPLING_DOWN_FACTOR (1) +#endif #define MAX_SAMPLING_DOWN_FACTOR (100000) -#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 90ff6be85cf466..15159c1cf6e1a0 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; + struct rcu_head rcu; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; @@ -368,13 +369,22 @@ static void evdev_attach_client(struct evdev *evdev, spin_unlock(&evdev->client_lock); } +static void evdev_reclaim_client(struct rcu_head *rp) +{ + struct evdev_client *client = container_of(rp, struct evdev_client, rcu); + unsigned int i; + for (i = 0; i < EV_CNT; ++i) + bitmap_free(client->evmasks[i]); + kvfree(client); +} + static void evdev_detach_client(struct evdev *evdev, 
struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); - synchronize_rcu(); + call_rcu(&client->rcu, evdev_reclaim_client); } static int evdev_open_device(struct evdev *evdev) @@ -427,7 +437,6 @@ static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; - unsigned int i; mutex_lock(&evdev->mutex); @@ -439,11 +448,6 @@ static int evdev_release(struct inode *inode, struct file *file) evdev_detach_client(evdev, client); - for (i = 0; i < EV_CNT; ++i) - bitmap_free(client->evmasks[i]); - - kvfree(client); - evdev_close_device(evdev); return 0; @@ -486,7 +490,6 @@ static int evdev_open(struct inode *inode, struct file *file) err_free_client: evdev_detach_client(evdev, client); - kvfree(client); return error; } diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index 038ccbd9e3ba23..de5e4f5145af8d 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 +ifdef CONFIG_X86_64 +ifdef CONFIG_SATA_AHCI +obj-y += intel-nvme-remap.o +endif +endif + obj-$(CONFIG_PCIE_CADENCE) += cadence/ obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c new file mode 100644 index 00000000000000..e105e6f5cc91d1 --- /dev/null +++ b/drivers/pci/controller/intel-nvme-remap.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel remapped NVMe device support. + * + * Copyright (c) 2019 Endless Mobile, Inc. + * Author: Daniel Drake + * + * Some products ship by default with the SATA controller in "RAID" or + * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this + * mode, which we refer to as "remapped NVMe" mode, any installed NVMe + * devices disappear from the PCI bus, and instead their I/O memory becomes + * available within the AHCI device BARs. + * + * This scheme is understood to be a way of avoiding usage of the standard + * Windows NVMe driver under that OS, instead mandating usage of Intel's + * driver instead, which has better power management, and presumably offers + * some RAID/disk-caching solutions too. + * + * Here in this driver, we support the remapped NVMe mode by claiming the + * AHCI device and creating a fake PCIe root port. On the new bus, the + * original AHCI device is exposed with only minor tweaks. Then, fake PCI + * devices corresponding to the remapped NVMe devices are created. The usual + * ahci and nvme drivers are then expected to bind to these devices and + * operate as normal. + * + * The PCI configuration space for the NVMe devices is completely + * unavailable, so we fake a minimal one and hope for the best. + * + * Interrupts are shared between the AHCI and NVMe devices. For simplicity, + * we only support the legacy interrupt here, although MSI support + * could potentially be added later. 
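+ *
+ * Resulting topology, sketched here for orientation: the fake bus is
+ * placed in a fresh PCI domain (at least 0x10000, see
+ * find_free_domain() below), slot 0 exposes the pass-through AHCI
+ * controller, and slots 1..n expose one faked config space per
+ * remapped NVMe device, backed by memory windows inside the AHCI BAR.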
+ */ + +#define MODULE_NAME "intel-nvme-remap" + +#include +#include +#include +#include +#include + +#define AHCI_PCI_BAR_STANDARD 5 + +struct nvme_remap_dev { + struct pci_dev *dev; /* AHCI device */ + struct pci_bus *bus; /* our fake PCI bus */ + struct pci_sysdata sysdata; + int irq_base; /* our fake interrupts */ + + /* + * When we detect an all-ones write to a BAR register, this flag + * is set, so that we return the BAR size on the next read (a + * standard PCI behaviour). + * This includes the assumption that an all-ones BAR write is + * immediately followed by a read of the same register. + */ + bool bar_sizing; + + /* + * Resources copied from the AHCI device, to be regarded as + * resources on our fake bus. + */ + struct resource ahci_resources[PCI_NUM_RESOURCES]; + + /* Resources corresponding to the NVMe devices. */ + struct resource remapped_dev_mem[AHCI_MAX_REMAP]; + + /* Number of remapped NVMe devices found. */ + int num_remapped_devices; +}; + +static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus) +{ + return container_of(bus->sysdata, struct nvme_remap_dev, sysdata); +} + + +/******** PCI configuration space **********/ + +/* + * Helper macros for tweaking returned contents of PCI configuration space. + * + * value contains len bytes of data read from reg. + * If fixup_reg is included in that range, fix up the contents of that + * register to fixed_value. + */ +#define NR_FIX8(fixup_reg, fixed_value) do { \ + if (reg <= fixup_reg && fixup_reg < reg + len) \ + ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \ + } while (0) + +#define NR_FIX16(fixup_reg, fixed_value) do { \ + NR_FIX8(fixup_reg, fixed_value); \ + NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ + } while (0) + +#define NR_FIX24(fixup_reg, fixed_value) do { \ + NR_FIX8(fixup_reg, fixed_value); \ + NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ + NR_FIX8(fixup_reg + 2, fixed_value >> 16); \ + } while (0) + +#define NR_FIX32(fixup_reg, fixed_value) do { \ + NR_FIX16(fixup_reg, (u16) fixed_value); \ + NR_FIX16(fixup_reg + 2, fixed_value >> 16); \ + } while (0) + +/* + * Read PCI config space of the slot 0 (AHCI) device. + * We pass through the read request to the underlying device, but + * tweak the results in some cases. + */ +static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg, + int len, u32 *value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct pci_bus *ahci_dev_bus = nrdev->dev->bus; + int ret; + + ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn, + reg, len, value); + if (ret) + return ret; + + /* + * Adjust the device class, to prevent this driver from attempting to + * additionally probe the device we're simulating here. + */ + NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI); + + /* + * Unset interrupt pin, otherwise ACPI tries to find routing + * info for our virtual IRQ, fails, and complains. + */ + NR_FIX8(PCI_INTERRUPT_PIN, 0); + + /* + * Truncate the AHCI BAR to not include the region that covers the + * hidden devices. This will cause the ahci driver to successfully + * probe th new device (instead of handing it over to this driver). + */ + if (nrdev->bar_sizing) { + NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1)); + nrdev->bar_sizing = false; + } + + return PCIBIOS_SUCCESSFUL; +} + +/* + * Read PCI config space of a remapped device. + * Since the original PCI config space is inaccessible, we provide a minimal, + * fake config space instead. 
+ */ +static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port, + int reg, int len, u32 *value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct resource *remapped_mem; + + if (port > nrdev->num_remapped_devices) + return PCIBIOS_DEVICE_NOT_FOUND; + + *value = 0; + remapped_mem = &nrdev->remapped_dev_mem[port - 1]; + + /* Set a Vendor ID, otherwise Linux assumes no device is present */ + NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL); + + /* Always appear on & bus mastering */ + NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + + /* Set class so that nvme driver probes us */ + NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS); + + if (nrdev->bar_sizing) { + NR_FIX32(PCI_BASE_ADDRESS_0, + ~(resource_size(remapped_mem) - 1)); + nrdev->bar_sizing = false; + } else { + resource_size_t mem_start = remapped_mem->start; + + mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64; + NR_FIX32(PCI_BASE_ADDRESS_0, mem_start); + mem_start >>= 32; + NR_FIX32(PCI_BASE_ADDRESS_1, mem_start); + } + + return PCIBIOS_SUCCESSFUL; +} + +/* Read PCI configuration space. */ +static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 *value) +{ + if (PCI_SLOT(devfn) == 0) + return nvme_remap_pci_read_slot0(bus, reg, len, value); + else + return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn), + reg, len, value); +} + +/* + * Write PCI config space of the slot 0 (AHCI) device. + * Apart from the special case of BAR sizing, we disable all writes. + * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master) + * that would affect the operation of the NVMe devices. + */ +static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg, + int len, u32 value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + struct pci_bus *ahci_dev_bus = nrdev->dev->bus; + + if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) { + /* + * Writing all-ones to a BAR means that the size of the + * memory region is being checked. Flag this so that we can + * reply with an appropriate size on the next read. + */ + if (value == ~0) + nrdev->bar_sizing = true; + + return ahci_dev_bus->ops->write(ahci_dev_bus, + nrdev->dev->devfn, + reg, len, value); + } + + return PCIBIOS_SET_FAILED; +} + +/* + * Write PCI config space of a remapped device. + * Since the original PCI config space is inaccessible, we reject all + * writes, except for the special case of BAR probing. + */ +static int nvme_remap_pci_write_remapped(struct pci_bus *bus, + unsigned int port, + int reg, int len, u32 value) +{ + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); + + if (port > nrdev->num_remapped_devices) + return PCIBIOS_DEVICE_NOT_FOUND; + + /* + * Writing all-ones to a BAR means that the size of the memory + * region is being checked. Flag this so that we can reply with + * an appropriate size on the next read. + */ + if (value == ~0 && reg >= PCI_BASE_ADDRESS_0 + && reg <= PCI_BASE_ADDRESS_5) { + nrdev->bar_sizing = true; + return PCIBIOS_SUCCESSFUL; + } + + return PCIBIOS_SET_FAILED; +} + +/* Write PCI configuration space. 
*/ +static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 value) +{ + if (PCI_SLOT(devfn) == 0) + return nvme_remap_pci_write_slot0(bus, reg, len, value); + else + return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn), + reg, len, value); +} + +static struct pci_ops nvme_remap_pci_ops = { + .read = nvme_remap_pci_read, + .write = nvme_remap_pci_write, +}; + + +/******** Initialization & exit **********/ + +/* + * Find a PCI domain ID to use for our fake bus. + * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits). + */ +static int find_free_domain(void) +{ + int domain = 0xffff; + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus)) != NULL) + domain = max_t(int, domain, pci_domain_nr(bus)); + + return domain + 1; +} + +static int find_remapped_devices(struct nvme_remap_dev *nrdev, + struct list_head *resources) +{ + void __iomem *mmio; + int i, count = 0; + u32 cap; + + mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD, + pci_resource_len(nrdev->dev, + AHCI_PCI_BAR_STANDARD)); + if (!mmio) + return -ENODEV; + + /* Check if this device might have remapped nvme devices. */ + if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K || + !(readl(mmio + AHCI_VSCAP) & 1)) + return -ENODEV; + + cap = readq(mmio + AHCI_REMAP_CAP); + for (i = AHCI_MAX_REMAP-1; i >= 0; i--) { + struct resource *remapped_mem; + + if ((cap & (1 << i)) == 0) + continue; + if (readl(mmio + ahci_remap_dcc(i)) + != PCI_CLASS_STORAGE_EXPRESS) + continue; + + /* We've found a remapped device */ + remapped_mem = &nrdev->remapped_dev_mem[count++]; + remapped_mem->start = + pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD) + + ahci_remap_base(i); + remapped_mem->end = remapped_mem->start + + AHCI_REMAP_N_SIZE - 1; + remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED; + pci_add_resource(resources, remapped_mem); + } + + pcim_iounmap(nrdev->dev, mmio); + + if (count == 0) + return -ENODEV; + + nrdev->num_remapped_devices = count; + dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n", + nrdev->num_remapped_devices); + return 0; +} + +static void nvme_remap_remove_root_bus(void *data) +{ + struct pci_bus *bus = data; + + pci_stop_root_bus(bus); + pci_remove_root_bus(bus); +} + +static int nvme_remap_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + struct nvme_remap_dev *nrdev; + LIST_HEAD(resources); + int i; + int ret; + struct pci_dev *child; + + nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL); + nrdev->sysdata.domain = find_free_domain(); + nrdev->sysdata.nvme_remap_dev = dev; + nrdev->dev = dev; + pci_set_drvdata(dev, nrdev); + + ret = pcim_enable_device(dev); + if (ret < 0) + return ret; + + pci_set_master(dev); + + ret = find_remapped_devices(nrdev, &resources); + if (ret) + return ret; + + /* Add resources from the original AHCI device */ + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &dev->resource[i]; + + if (res->start) { + struct resource *nr_res = &nrdev->ahci_resources[i]; + + nr_res->start = res->start; + nr_res->end = res->end; + nr_res->flags = res->flags; + pci_add_resource(&resources, nr_res); + } + } + + /* Create virtual interrupts */ + nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0, + nrdev->num_remapped_devices + 1, + 0); + if (nrdev->irq_base < 0) + return nrdev->irq_base; + + /* Create and populate PCI bus */ + nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops, + &nrdev->sysdata, &resources); + if (!nrdev->bus) + return 
-ENODEV; + + if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus, + nrdev->bus)) + return -ENOMEM; + + /* We don't support sharing MSI interrupts between these devices */ + nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI; + + pci_scan_child_bus(nrdev->bus); + + list_for_each_entry(child, &nrdev->bus->devices, bus_list) { + /* + * Prevent PCI core from trying to move memory BARs around. + * The hidden NVMe devices are at fixed locations. + */ + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &child->resource[i]; + + if (res->flags & IORESOURCE_MEM) + res->flags |= IORESOURCE_PCI_FIXED; + } + + /* Share the legacy IRQ between all devices */ + child->irq = dev->irq; + } + + pci_assign_unassigned_bus_resources(nrdev->bus); + pci_bus_add_devices(nrdev->bus); + + return 0; +} + +static const struct pci_device_id nvme_remap_ids[] = { + /* + * Match all Intel RAID controllers. + * + * There's overlap here with the set of devices detected by the ahci + * driver, but ahci will only successfully probe when there + * *aren't* any remapped NVMe devices, and this driver will only + * successfully probe when there *are* remapped NVMe devices that + * need handling. + */ + { + PCI_VDEVICE(INTEL, PCI_ANY_ID), + .class = PCI_CLASS_STORAGE_RAID << 8, + .class_mask = 0xffffff00, + }, + {0,} +}; +MODULE_DEVICE_TABLE(pci, nvme_remap_ids); + +static struct pci_driver nvme_remap_drv = { + .name = MODULE_NAME, + .id_table = nvme_remap_ids, + .probe = nvme_remap_probe, +}; +module_pci_driver(nvme_remap_drv); + +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL v2"); diff --git a/fs/proc/base.c b/fs/proc/base.c index 6299878e3d97e6..8558fb1c26c0a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3357,6 +3357,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), +#ifdef CONFIG_MEM_SOFT_DIRTY + REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations), +#endif #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d1598576506c1e..5898cd154fd252 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -406,6 +406,7 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_pagemap_reset_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc35a0543f0191..8e0afdd680fcf3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1580,6 +1580,8 @@ enum clear_refs_types { struct clear_refs_private { enum clear_refs_types type; + unsigned long start, end; + bool clear_range; }; #ifdef CONFIG_MEM_SOFT_DIRTY @@ -1600,7 +1602,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return folio_maybe_dma_pinned(folio); } -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* @@ -1610,37 +1612,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. 
*/ pte_t ptent = ptep_get(pte); + bool ret = false; if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) - return; + return ret; old_pte = ptep_modify_prot_start(vma, addr, pte); + ret = pte_soft_dirty(old_pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { + ret = pte_swp_soft_dirty(ptent); ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } + return ret; } #else -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + return false; } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + bool ret = false; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); + + ret = pmd_soft_dirty(old); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) @@ -1651,14 +1662,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + ret = pmd_swp_soft_dirty(pmd); pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } + return ret; } #else -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { + return false; } #endif @@ -1671,6 +1685,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct folio *folio; + BUG_ON(addr < cp->start || end > cp->end); + ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { @@ -1728,9 +1744,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; - if (vma->vm_flags & VM_PFNMAP) + if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP)) return 1; + BUG_ON(start < cp->start || end > cp->end); + /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. 
@@ -1754,10 +1772,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF] = {}; + char buffer[18] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + unsigned long start, end; + bool clear_range; int itype; int rv; @@ -1765,12 +1785,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &itype); - if (rv < 0) - return rv; - type = (enum clear_refs_types)itype; - if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) - return -EINVAL; + + if (buffer[0] == '6') + { + static int once; + + if (!once++) + printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n"); + + if (count != 17) + return -EINVAL; + + type = CLEAR_REFS_SOFT_DIRTY; + start = *(unsigned long *)(buffer + 1); + end = *(unsigned long *)(buffer + 1 + 8); + } + else + { + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) + return -EINVAL; + + start = 0; + end = -1UL; + } task = get_proc_task(file_inode(file)); if (!task) @@ -1783,40 +1825,86 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .type = type, }; - if (mmap_write_lock_killable(mm)) { - count = -EINTR; - goto out_mm; + if (start || end != -1UL) + { + start = min(start, -1UL) & PAGE_MASK; + end = min(end, -1UL) & PAGE_MASK; + + if (start >= end) + { + count = -EINVAL; + goto out_mm; + } + clear_range = true; } + else + { + clear_range = false; + } + + cp.start = start; + cp.end = end; + cp.clear_range = clear_range; + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); - goto out_unlock; + mmap_write_unlock(mm); + goto out_mm; } if (type == CLEAR_REFS_SOFT_DIRTY) { - for_each_vma(vmi, vma) { - if (!(vma->vm_flags & VM_SOFTDIRTY)) - continue; - vm_flags_clear(vma, VM_SOFTDIRTY); - vma_set_page_prot(vma); + if (mmap_read_lock_killable(mm)) { + count = -EINTR; + goto out_mm; } - + if (!clear_range) + for_each_vma(vmi, vma) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + for_each_vma(vmi, vma) { + vm_flags_clear(vma, VM_SOFTDIRTY); + vma_set_page_prot(vma); + } + mmap_write_downgrade(mm); + break; + } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, mm, 0, -1UL); + 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); + else + { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + } + walk_page_range(mm, start, end == -1UL ? 
-1 : end, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); + mmap_read_unlock(mm); + } + else + { + mmap_write_unlock(mm); } -out_unlock: - mmap_write_unlock(mm); out_mm: mmput(mm); } @@ -1838,6 +1926,7 @@ struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; + bool reset; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -1848,6 +1937,7 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_SOFT_DIRTY_PAGE BIT_ULL(57) #define PM_UFFD_WP BIT_ULL(57) #define PM_GUARD_REGION BIT_ULL(58) #define PM_FILE BIT_ULL(61) @@ -1869,6 +1959,14 @@ static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) return 0; } +static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm) +{ + ((unsigned long *)pm->buffer)[pm->pos++] = addr; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) { if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) @@ -1883,6 +1981,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, unsigned long addr = start; int err = 0; + if (pm->reset) + goto out; + while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); @@ -1929,13 +2030,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); @@ -1993,6 +2094,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct page *page = NULL; struct folio *folio = NULL; + if (pm->reset) + { + if (clear_soft_dirty_pmd(vma, addr, pmdp)) + { + for (; addr != end; addr += PAGE_SIZE) + { + err = add_addr_to_pagemap(addr, pm); + if (err) + break; + } + } + goto trans_huge_done; + } + if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -2001,7 +2116,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) @@ -2022,7 +2137,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); @@ -2055,6 +2170,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, frame += (1 << MAX_SWAPFILES_SHIFT); } } +trans_huge_done: spin_unlock(ptl); return err; } @@ -2070,10 +2186,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { - pagemap_entry_t pme; + if (pm->reset) + { + if (clear_soft_dirty(vma, addr, pte)) + err = add_addr_to_pagemap(addr, pm); + } + else + { + 
pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(&pme, pm); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); + err = add_to_pagemap(&pme, pm); + } if (err) break; } @@ -2177,8 +2301,8 @@ static const struct mm_walk_ops pagemap_ops = { * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t do_pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, bool reset) { struct mm_struct *mm = file->private_data; struct pagemapread pm; @@ -2187,6 +2311,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; + struct mmu_notifier_range range; + size_t buffer_len; if (!mm || !mmget_not_zero(mm)) goto out; @@ -2202,19 +2328,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); + pm.reset = reset; + + buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES); - pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); + pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; - end_vaddr = mm->task_size; + + start_vaddr = svpfn << PAGE_SHIFT; + + if (reset) + { + if (count < sizeof(end_vaddr)) + { + ret = -EINVAL; + goto out_mm; + } + if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) + return -EFAULT; + end_vaddr = min(end_vaddr, mm->task_size); + } + else + { + end_vaddr = mm->task_size; + start_vaddr = end_vaddr; + } /* watch out for wraparound */ - start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; @@ -2239,18 +2384,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + pm.len = min(buffer_len, count / PM_ENTRY_BYTES); + + end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT)); /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; + + if (reset) + { + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, mm, start_vaddr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); + if (reset) + { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); + } mmap_read_unlock(mm); - start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); + BUG_ON(ret && ret != PM_END_OF_BUFFER); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; @@ -2258,6 +2420,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, copied += len; buf += len; count -= len; + + start_vaddr = reset && pm.pos == pm.len ? 
((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) @@ -2271,6 +2435,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, return ret; } +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, false); +} + +static ssize_t pagemap_reset_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, true); +} + static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; @@ -3065,6 +3241,14 @@ const struct file_operations proc_pagemap_operations = { .unlocked_ioctl = do_pagemap_cmd, .compat_ioctl = do_pagemap_cmd, }; + +const struct file_operations proc_pagemap_reset_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_reset_read, + .open = pagemap_open, + .release = pagemap_release, +}; + #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 51b6484c049345..9fae896aae83f8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -508,6 +508,9 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) +# elif defined(CONFIG_FORCE_IRQ_THREADING) +DECLARE_STATIC_KEY_TRUE(force_irqthreads_key); +# define force_irqthreads() (static_branch_likely(&force_irqthreads_key)) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82c9..819daddd3a493b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,7 +201,7 @@ static inline void __mm_zero_struct_page(struct page *page) * that. 
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; diff --git a/include/linux/sched.h b/include/linux/sched.h index b469878de25c8a..5809513cf54b7f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -816,6 +816,32 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_SCHED_BORE +#define BORE_BC_TIMESTAMP_SHIFT 16 + +struct bore_bc { + u64 timestamp: 48; + u64 penalty: 16; +}; + +struct bore_ctx { + struct bore_bc subtree; + struct bore_bc group; + u64 burst_time; + u16 prev_penalty; + u16 curr_penalty; + union { + u16 penalty; + struct { + u8 _; + u8 score; + }; + }; + bool stop_update; + bool futex_waiting; +}; +#endif /* CONFIG_SCHED_BORE */ + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -874,6 +900,9 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif +#ifdef CONFIG_SCHED_BORE + struct bore_ctx bore; +#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h new file mode 100644 index 00000000000000..79dd72b9aa38ff --- /dev/null +++ b/include/linux/sched/bore.h @@ -0,0 +1,39 @@ +#ifndef _KERNEL_SCHED_BORE_H +#define _KERNEL_SCHED_BORE_H + +#include +#include +#include +#include +#include + +#define SCHED_BORE_AUTHOR "Masahito Suzuki" +#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" + +#define SCHED_BORE_VERSION "6.5.9" + +extern u8 __read_mostly sched_bore; +extern u8 __read_mostly sched_burst_inherit_type; +extern u8 __read_mostly sched_burst_smoothness; +extern u8 __read_mostly sched_burst_penalty_offset; +extern uint __read_mostly sched_burst_penalty_scale; +extern uint __read_mostly sched_burst_cache_lifetime; + +extern u8 effective_prio_bore(struct task_struct *p); +extern void update_curr_bore(struct task_struct *p, u64 delta_exec); +extern void restart_burst_bore(struct task_struct *p); +extern void restart_burst_rescale_deadline_bore(struct task_struct *p); +extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, + u64 clone_flags, u64 now); +extern void sched_init_bore(void); +extern void reset_task_bore(struct task_struct *p); + +extern int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +extern void reweight_entity( + struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); + +#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 7e2744ec89336a..87126e49fc3009 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -22,6 +22,7 @@ #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_LOCK_PI2 13 +#define FUTEX_WAIT_MULTIPLE 31 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -100,6 +101,18 @@ struct futex_waitv { __u32 __reserved; }; +/** + * struct futex_wait_block - Block of futexes to be waited for + * @uaddr: User address of the futex + * @val: Futex value expected by userspace + * @bitset: Bitset for the optional bitmasked wakeup + */ +struct futex_wait_block { + __u32 __user *uaddr; + __u32 val; + __u32 bitset; +}; + /* * Support for robust futexes: the kernel cleans up 
held futexes at * thread exit time. diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49e7..f93c898c20f886 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -187,6 +187,37 @@ config THREAD_INFO_IN_TASK menu "General setup" +config ZEN_INTERACTIVE + bool "Tune kernel for interactivity" + default y + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. + + --- Block Layer ---------------------------------------- + + Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber + + --- Virtual Memory Subsystem --------------------------- + + Background-reclaim hugepages...: no -> yes + Compact unevictable............: yes -> no + Watermark boost factor.........: 1.5 -> 0 + Swap-in readahead..............: 3 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + + Minimal granularity............: 0.7 -> 0.4 ms + Migration cost.................: 0.5 -> 0.3 ms + Bandwidth slice size...........: 5 -> 3 ms + Task rebalancing threshold.....: 32 -> 8 + + --- CPUFreq Settings ----------------------------------- + + Ondemand sampling down factor..: 1 -> 5 + Ondemand default up threshold..: 80 -> 55 + Ondemand micro up threshold....: 95 -> 60 + config BROKEN bool help @@ -1423,6 +1454,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. +config SCHED_BORE + bool "Burst-Oriented Response Enhancer" + default y + help + In Desktop and Mobile computing, one might prefer interactive + tasks to keep responsive no matter what they run in the background. + + Enabling this kernel feature modifies the scheduler to discriminate + tasks by their burst time (runtime since it last went sleeping or + yielding state) and prioritize those that run less bursty. + Such tasks usually include window compositor, widgets backend, + terminal emulator, video playback, games and so on. + With a little impact to scheduling fairness, it may improve + responsiveness especially under heavy background workload. + + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index ce1435cb08b1ec..9eee2005e25f07 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -57,3 +57,20 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS + +config MIN_BASE_SLICE_NS + int "Default value for min_base_slice_ns" + default 2000000 + help + The BORE Scheduler automatically calculates the optimal base + slice for the configured HZ using the following equation: + + base_slice_ns = + 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) + + This option sets the default lower bound limit of the base slice + to prevent the loss of task throughput due to overscheduling. + + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a95e..a166224b5f8638 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -116,6 +116,10 @@ /* For dup_mmap(). */ #include "../mm/internal.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + #include #define CREATE_TRACE_POINTS @@ -2325,6 +2329,10 @@ __latent_entropy struct task_struct *copy_process( * Need tasklist lock for parent etc handling! 
*/ write_lock_irq(&tasklist_lock); +#ifdef CONFIG_SCHED_BORE + if (likely(p->pid)) + task_fork_bore(p, current, clone_flags, p->start_time); +#endif /* CONFIG_SCHED_BORE */ /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 880c9bf2f31504..fb3b617b9ed83d 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -166,6 +166,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_WAIT_MULTIPLE: return true; } return false; @@ -178,13 +179,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) return -EINVAL; *t = timespec64_to_ktime(*ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } +/** + * futex_read_wait_block - Read an array of futex_wait_block from userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function creates and allocate an array of futex_q (we zero it to + * initialize the fields) and then, for each futex_wait_block element from + * userspace, fill a futex_q element with proper values. + */ +inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count) +{ + unsigned int i; + struct futex_vector *futexv; + struct futex_wait_block fwb; + struct futex_wait_block __user *entry = + (struct futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_WAITV_MAX) + return ERR_PTR(-EINVAL); + + futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(futexv); + return ERR_PTR(-EFAULT); + } + + futexv[i].w.flags = FUTEX_32; + futexv[i].w.val = fwb.val; + futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr); + futexv[i].q = futex_q_init; + } + + return futexv; +} + +int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to); + +int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count) +{ + int ret; + struct futex_vector *vs; + struct hrtimer_sleeper *to = NULL, timeout; + + to = futex_setup_timer(abs_time, &timeout, 0, 0); + + vs = futex_read_wait_block(uaddr, count); + + if (IS_ERR(vs)) + return PTR_ERR(vs); + + ret = futex_wait_multiple(vs, count, abs_time ? 
to : NULL); + kfree(vs); + + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + + return ret; +} + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) @@ -204,6 +271,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } @@ -512,6 +582,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec27a..6484ad583f3bf7 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,6 +4,9 @@ #include #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ #include "futex.h" @@ -355,7 +358,15 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) +#ifdef CONFIG_SCHED_BORE + { + current->bore.futex_waiting = true; +#endif /* CONFIG_SCHED_BORE */ schedule(); +#ifdef CONFIG_SCHED_BORE + current->bore.futex_waiting = false; + } +#endif /* CONFIG_SCHED_BORE */ } __set_current_state(TASK_RUNNING); } diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1b4254d19a73ec..40d19d6826afdd 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -109,6 +109,23 @@ config GENERIC_IRQ_STAT_SNAPSHOT config IRQ_FORCED_THREADING bool +config FORCE_IRQ_THREADING + bool "Make IRQ threading compulsory" + depends on IRQ_FORCED_THREADING + default n + help + + Make IRQ threading mandatory for any IRQ handlers that support it + instead of being optional and requiring the threadirqs kernel + parameter. Instead they can be optionally disabled with the + nothreadirqs kernel parameter. + + Enabling this may make some architectures not boot with runqueue + sharing and MuQSS. + + Enable if you are building for a desktop or low latency system, + otherwise say N. 
+ config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ help diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 400856abf67219..56f4ccc02dac37 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -25,7 +25,18 @@ #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_FORCE_IRQ_THREADING +DEFINE_STATIC_KEY_TRUE(force_irqthreads_key); +#else DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); +#endif + +static int __init setup_noforced_irqthreads(char *arg) +{ + static_branch_disable(&force_irqthreads_key); + return 0; +} +early_param("nothreadirqs", setup_noforced_irqthreads); static int __init setup_forced_irqthreads(char *arg) { diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae86371ddcddf..b688084bcecc75 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c new file mode 100644 index 00000000000000..7c1baa1bdf6c56 --- /dev/null +++ b/kernel/sched/bore.c @@ -0,0 +1,387 @@ +/* + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler + * Copyright (C) 2021-2025 Masahito Suzuki + */ +#include +#include +#include +#include "sched.h" + +#ifdef CONFIG_SCHED_BORE +u8 __read_mostly sched_bore = 1; +u8 __read_mostly sched_burst_inherit_type = 2; +u8 __read_mostly sched_burst_smoothness = 1; +u8 __read_mostly sched_burst_penalty_offset = 24; +uint __read_mostly sched_burst_penalty_scale = 1536; +uint __read_mostly sched_burst_cache_lifetime = 75000000; +static int __maybe_unused maxval_prio = 39; +static int __maybe_unused maxval_6_bits = 63; +static int __maybe_unused maxval_8_bits = 255; +static int __maybe_unused maxval_12_bits = 4095; + +#define MAX_BURST_PENALTY ((40U << 8) - 1) +#define BURST_CACHE_STOP_COUNT 63 + +static u32 (*inherit_penalty_fn)(struct task_struct *, u64, u64); + +static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { + if (!v) return 0; + u32 exponent = fls64(v), + mantissa = (u32)(v << (64 - exponent) << 1 >> (64 - fp)); + return exponent << fp | mantissa; +} + +static inline u32 calc_burst_penalty(u64 burst_time) { + u32 greed = log2p1_u64_u32fp(burst_time, 8), + tolerance = sched_burst_penalty_offset << 8, + penalty = max(0, (s32)(greed - tolerance)), + scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + return min(MAX_BURST_PENALTY, scaled_penalty); +} + +static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { + u64 unscaled, rescaled; + unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); + rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); + return rescaled; +} + +static inline u32 binary_smooth(u32 new, u32 old) { + if (new <= old) return new; + + u32 increment = new - old, + shift = sched_burst_smoothness, + divisor = 1U << shift; + + return old + ((increment + divisor - 1) >> shift); +} + +static void reweight_task_by_prio(struct task_struct *p, int prio) { + if (task_has_idle_policy(p)) return; + + struct sched_entity *se = &p->se; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + if (se->on_rq) { + p->bore.stop_update = true; + reweight_entity(cfs_rq_of(se), se, weight); + p->bore.stop_update = false; + } else + se->load.weight = weight; + se->load.inv_weight = sched_prio_to_wmult[prio]; +} + +u8 effective_prio_bore(struct task_struct *p) { + int prio = p->static_prio - 
MAX_RT_PRIO; + if (likely(sched_bore)) + prio += p->bore.score; + return (u8)clamp(prio, 0, maxval_prio); +} + +static void update_penalty(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + + u8 prev_prio = effective_prio_bore(p); + + ctx->penalty = (p->flags & PF_KTHREAD)? 0: + max(ctx->curr_penalty, ctx->prev_penalty); + + u8 new_prio = effective_prio_bore(p); + if (new_prio != prev_prio) + reweight_task_by_prio(p, new_prio); +} + +void update_curr_bore(struct task_struct *p, u64 delta_exec) { + struct bore_ctx *ctx = &p->bore; + if (ctx->stop_update) return; + + ctx->burst_time += delta_exec; + u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); + + if (curr_penalty <= ctx->prev_penalty) return; + update_penalty(p); +} + +void restart_burst_bore(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); + ctx->prev_penalty = new_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + update_penalty(p); +} + +void restart_burst_rescale_deadline_bore(struct task_struct *p) { + struct sched_entity *se = &p->se; + s64 vscaled, vremain = se->deadline - se->vruntime; + + u8 old_prio = effective_prio_bore(p); + restart_burst_bore(p); + u8 new_prio = effective_prio_bore(p); + + if (old_prio > new_prio) { + vscaled = rescale_slice(abs(vremain), old_prio, new_prio); + if (unlikely(vremain < 0)) + vscaled = -vscaled; + se->deadline = se->vruntime + vscaled; + } +} + +static inline bool task_is_bore_eligible(struct task_struct *p) +{return p && p->sched_class == &fair_sched_class && !p->exit_state;} + +#ifndef for_each_child_task +#define for_each_child_task(p, t) \ + list_for_each_entry(t, &(p)->children, sibling) +#endif + +static inline u32 count_children_upto2(struct task_struct *p) { + struct list_head *head = &p->children; + struct list_head *next = head->next; + return (next != head) + (next->next != head); +} + +static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { + u64 timestamp = bc->timestamp << BORE_BC_TIMESTAMP_SHIFT; + return now - timestamp > sched_burst_cache_lifetime; +} + +static void update_burst_cache(struct bore_bc *bc, + struct task_struct *p, u32 count, u32 total, u64 now) { + u32 average = count ? 
total / count : 0; + bc->penalty = max(average, p->bore.penalty); + bc->timestamp = now >> BORE_BC_TIMESTAMP_SHIFT; +} + +static u32 inherit_none(struct task_struct *parent, + u64 clone_flags, u64 now) +{ return 0; } + +static u32 inherit_from_parent(struct task_struct *parent, + u64 clone_flags, u64 now) { + if (clone_flags & CLONE_PARENT) + parent = parent->real_parent; + + struct bore_bc *bc = &parent->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *child; + u32 count = 0, total = 0; + for_each_child_task(parent, child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(child)) continue; + count++; + total += child->bore.penalty; + } + + update_burst_cache(bc, parent, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_ancestor_hub(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct task_struct *ancestor = parent; + u32 sole_child_count = 0; + + if (clone_flags & CLONE_PARENT) { + ancestor = ancestor->real_parent; + sole_child_count = 1; + } + + for (struct task_struct *next; + (next = ancestor->real_parent) != ancestor && + count_children_upto2(ancestor) <= sole_child_count; + ancestor = next, sole_child_count = 1) {} + + struct bore_bc *bc = &ancestor->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *direct_child; + u32 count = 0, total = 0; + for_each_child_task(ancestor, direct_child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + struct task_struct *descendant = direct_child; + while (count_children_upto2(descendant) == 1) + descendant = list_first_entry(&descendant->children, + struct task_struct, sibling); + + if (!task_is_bore_eligible(descendant)) continue; + count++; + total += descendant->bore.penalty; + } + + update_burst_cache(bc, ancestor, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { + struct task_struct *leader = p->group_leader; + struct bore_bc *bc = &leader->bore.group; + + if (burst_cache_expired(bc, now)) { + struct task_struct *sibling; + u32 count = 0, total = 0; + + for_each_thread(leader, sibling) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(sibling)) continue; + count++; + total += sibling->bore.penalty; + } + + update_burst_cache(bc, leader, count, total, now); + } + + return bc->penalty; +} + +void task_fork_bore(struct task_struct *p, + struct task_struct *parent, u64 clone_flags, u64 now) { + if (!task_is_bore_eligible(p) || unlikely(!sched_bore)) return; + + struct bore_ctx *ctx = &p->bore; + u32 inherited_penalty = (clone_flags & CLONE_THREAD)? 
+ inherit_from_thread_group(parent, now): + inherit_penalty_fn(parent, clone_flags, now); + + if (ctx->prev_penalty < inherited_penalty) + ctx->prev_penalty = inherited_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + ctx->stop_update = false; + ctx->futex_waiting = false; + update_penalty(p); +} + +void reset_task_bore(struct task_struct *p) +{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } + +static void update_inherit_type(void) { + switch(sched_burst_inherit_type) { + case 1: + inherit_penalty_fn = inherit_from_parent; + break; + case 2: + inherit_penalty_fn = inherit_from_ancestor_hub; + break; + default: + inherit_penalty_fn = inherit_none; + } +} + +void __init sched_init_bore(void) { + printk(KERN_INFO "%s %s by %s\n", + SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); + + reset_task_bore(&init_task); + update_inherit_type(); +} + +static void readjust_all_task_weights(void) { + struct task_struct *task; + struct rq *rq; + struct rq_flags rf; + + scoped_guard(write_lock_irq, &tasklist_lock) + for_each_process(task) { + if (!task_is_bore_eligible(task)) continue; + rq = task_rq_lock(task, &rf); + update_rq_clock(rq); + reweight_task_by_prio(task, effective_prio_bore(task)); + task_rq_unlock(rq, task, &rf); + } +} + +int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_inherit_type(); + + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_bore_sysctls[] = { + { + .procname = "sched_bore", + .data = &sched_bore, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_bore_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_inherit_type", + .data = &sched_burst_inherit_type, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_inherit_type_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_burst_penalty_offset", + .data = &sched_burst_penalty_offset, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_6_bits, + }, + { + .procname = "sched_burst_penalty_scale", + .data = &sched_burst_penalty_scale, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_12_bits, + }, + { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static int __init sched_bore_sysctl_init(void) { + register_sysctl_init("kernel", sched_bore_sysctls); + return 0; +} +late_initcall(sched_bore_sysctl_init); + +#endif // CONFIG_SYSCTL +#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f754a60de84849..c5a6f727fef143 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -100,6 
+100,10 @@ #include "../smpboot.h" #include "../locking/mutex.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1431,7 +1435,11 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { +#ifdef CONFIG_SCHED_BORE + int prio = effective_prio_bore(p); +#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; +#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8655,6 +8663,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); +#endif /* CONFIG_SCHED_BORE */ + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a7901e..751df396d94ba6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_BORE +#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ +static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ +{ \ + char buf[16]; \ + unsigned int value; \ +\ + if (cnt > 15) \ + cnt = 15; \ +\ + if (copy_from_user(&buf, ubuf, cnt)) \ + return -EFAULT; \ + buf[cnt] = '\0'; \ +\ + if (kstrtouint(buf, 10, &value)) \ + return -EINVAL; \ +\ + sysctl_sched_##name = value; \ + sched_update_##update_func(); \ +\ + *ppos += cnt; \ + return cnt; \ +} \ +\ +static int sched_##name##_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", sysctl_sched_##name); \ + return 0; \ +} \ +\ +static int sched_##name##_open(struct inode *inode, struct file *filp) \ +{ \ + return single_open(filp, sched_##name##_show, NULL); \ +} \ +\ +static const struct file_operations sched_##name##_fops = { \ + .open = sched_##name##_open, \ + .write = sched_##name##_write, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + +#undef DEFINE_SYSCTL_SCHED_FUNC +#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,6 +261,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -500,12 +548,19 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifdef CONFIG_SCHED_BORE + debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); + debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); +#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, 
&sysctl_sched_nr_migrate); @@ -747,6 +802,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE + SEQ_printf(m, " %2d", p->bore.score); +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1224,6 +1282,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); +#ifdef CONFIG_SCHED_BORE + P(bore.score); +#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b752324270b08..66526308dcaf98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,6 +58,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * The initial- and re-scaling of tunables is configurable * @@ -67,20 +71,42 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant + * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * + * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_SCHED_BORE + +static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; +unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; + +#elifdef CONFIG_ZEN_INTERACTIVE + +unsigned int sysctl_sched_base_slice = 400000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; +__read_mostly unsigned int sysctl_sched_migration_cost = 300000UL; + +#else + unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; - __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; +#endif /* CONFIG_SCHED_BORE */ + static int __init setup_sched_thermal_decay_shift(char *str) { pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); @@ -122,8 +148,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ @@ -189,6 +219,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE +static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} +void sched_update_min_base_slice(void) { update_sysctl(); } +#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -219,6 +256,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } +#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -698,6 +736,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) vlag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE + limit >>= !!sched_bore; +#endif /* CONFIG_SCHED_BORE */ se->vlag = clamp(vlag, -limit, limit); } @@ -891,10 +932,17 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) */ static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#ifdef CONFIG_SCHED_BORE + u64 slice = sysctl_sched_base_slice; + bool run_to_parity = likely(sched_bore) ? + sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); +#else /* CONFIG_SCHED_BORE */ u64 slice = normalized_sysctl_sched_base_slice; + bool run_to_parity = sched_feat(RUN_TO_PARITY); +#endif /* CONFIG_SCHED_BORE */ u64 vprot = se->deadline; - if (sched_feat(RUN_TO_PARITY)) + if (run_to_parity) slice = cfs_rq_min_slice(cfs_rq); slice = min(slice, se->slice); @@ -959,6 +1007,11 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) curr = NULL; if (curr && protect && protect_slice(curr)) +#ifdef CONFIG_SCHED_BORE + if (!entity_is_task(curr) || + !task_of(curr)->bore.futex_waiting || + unlikely(!sched_bore)) +#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1020,6 +1073,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ +#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1031,6 +1085,7 @@ int sched_update_scaling(void) return 0; } +#endif /* CONFIG_SCHED_BORE */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1229,6 +1284,11 @@ static void update_curr(struct cfs_rq *cfs_rq) update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { +#ifdef CONFIG_SCHED_BORE + struct task_struct *p = task_of(curr); + update_curr_bore(p, delta_exec); +#endif /* CONFIG_SCHED_BORE */ + /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -3767,7 +3827,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; @@ -5124,12 +5184,11 @@ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice, vruntime = avg_vruntime(cfs_rq); + 
u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5214,7 +5273,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } - +#ifdef CONFIG_SCHED_BORE + if (entity_is_task(se) && + likely(sched_bore) && + task_of(se)->bore.futex_waiting) + goto vslice_found; +#endif /* !CONFIG_SCHED_BORE */ + vslice = calc_delta_fair(se->slice, se); +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); + else +#endif /* CONFIG_SCHED_BORE */ /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5223,6 +5293,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; +#ifdef CONFIG_SCHED_BORE +vslice_found: +#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5233,7 +5306,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void -requeue_delayed_entity(struct sched_entity *se); +requeue_delayed_entity(struct sched_entity *se, int flags); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5391,6 +5464,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); +#ifdef CONFIG_SCHED_BORE + if (sched_feat(DELAY_ZERO) && likely(sched_bore)) + update_entity_lag(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -6879,7 +6956,7 @@ static int sched_idle_cpu(int cpu) } static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -6892,13 +6969,22 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + flags |= ENQUEUE_WAKEUP; + else { +#endif /* CONFIG_SCHED_BORE */ + flags = 0; update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + } +#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, 0); + place_entity(cfs_rq, se, flags); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; @@ -6938,7 +7024,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); return; } @@ -6956,7 +7042,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); break; } cfs_rq = cfs_rq_of(se); @@ -7169,6 +7255,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); +#ifdef CONFIG_SCHED_BORE + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se = &p->se; + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + if (cfs_rq->curr == se) + 
update_curr(cfs_rq_of(&p->se)); + restart_burst_bore(p); + } +#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -7536,9 +7631,14 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas return new_cpu; } +static inline bool is_idle_cpu(int cpu) +{ + return available_idle_cpu(cpu) || sched_idle_cpu(cpu); +} + static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + if (is_idle_cpu(cpu) && sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; @@ -7549,6 +7649,24 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p) DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); +/* + * Return true if all the CPUs in the SMT core where @cpu belongs are idle, + * false otherwise. + */ +static bool is_idle_core(int cpu) +{ + int sibling; + + if (!sched_smt_active()) + return is_idle_cpu(cpu); + + for_each_cpu(sibling, cpu_smt_mask(cpu)) + if (!is_idle_cpu(sibling)) + return false; + + return true; +} + static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; @@ -7631,29 +7749,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { - if (cpu == target) - continue; - /* - * Check if the CPU is in the LLC scheduling domain of @target. - * Due to isolcpus, there is no guarantee that all the siblings are in the domain. - */ - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } - - return -1; -} - #else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) @@ -7670,9 +7765,9 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline bool is_idle_core(int cpu) { - return -1; + return is_idle_cpu(cpu); } #endif /* !CONFIG_SCHED_SMT */ @@ -7769,7 +7864,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + if (!is_idle_cpu(cpu)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); @@ -7840,7 +7935,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + if (is_idle_core(target) && asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -7848,7 +7943,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + is_idle_core(prev) && asym_fits_cpu(task_util, util_min, util_max, prev)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ -7880,7 +7975,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - 
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + is_idle_core(recent_used_cpu) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { @@ -7916,16 +8011,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - if (sched_smt_active()) { + if (sched_smt_active()) has_idle_core = test_idle_cores(target); - if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); - if ((unsigned int)i < nr_cpumask_bits) - return i; - } - } - i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -8817,7 +8905,13 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; +#ifdef CONFIG_SCHED_BORE + bool run_to_parity = likely(sched_bore) ? + sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); + if (run_to_parity && do_preempt_short) +#else /* CONFIG_SCHED_BORE */ if (sched_feat(RUN_TO_PARITY) && do_preempt_short) +#endif /* CONFIG_SCHED_BORE */ update_protect_slice(cfs_rq, se); return; @@ -8997,16 +9091,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ +#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_BORE + restart_burst_rescale_deadline_bore(curr); + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13256,6 +13359,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); +#ifdef CONFIG_SCHED_BORE + reset_task_bore(p); +#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331d6..abadc5ca74e2dd 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -18,6 +18,9 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) * 0-lag point or until is has exhausted it's slice. */ SCHED_FEAT(RUN_TO_PARITY, true) +#ifdef CONFIG_SCHED_BORE +SCHED_FEAT(RUN_TO_PARITY_BORE, false) +#endif /* CONFIG_SCHED_BORE */ /* * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for * current. 
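Before the kernel/sched/sched.h hunk below, it may help to see the burst-penalty arithmetic from the kernel/sched/bore.c hunk above in isolation. The following standalone program is a sketch under stated assumptions: it re-implements log2p1_u64_u32fp() and calc_burst_penalty() with the patch's default sysctl values (offset 24, scale 1536), replaces fls64() with __builtin_clzll(), and divides the 8.8 fixed-point result by 256 only to express it in whole units; how that maps onto the bore.score field used by effective_prio_bore() is defined in a header not shown in this hunk.

/*
 * Standalone model of the BORE burst-penalty curve (illustration only).
 * Mirrors log2p1_u64_u32fp() and calc_burst_penalty() from bore.c above.
 */
#include <stdint.h>
#include <stdio.h>

static const uint32_t sched_burst_penalty_offset = 24;   /* patch default */
static const uint32_t sched_burst_penalty_scale  = 1536; /* patch default */
#define MAX_BURST_PENALTY ((40U << 8) - 1)

/* fls64()-style log2-plus-1 with an 8-bit fixed-point fractional part. */
static uint32_t log2p1_u64_u32fp(uint64_t v, uint8_t fp)
{
	if (!v)
		return 0;
	uint32_t exponent = 64 - (uint32_t)__builtin_clzll(v);  /* fls64(v) */
	uint32_t mantissa = (uint32_t)(v << (64 - exponent) << 1 >> (64 - fp));
	return exponent << fp | mantissa;
}

static uint32_t calc_burst_penalty(uint64_t burst_time_ns)
{
	uint32_t greed = log2p1_u64_u32fp(burst_time_ns, 8);
	uint32_t tolerance = sched_burst_penalty_offset << 8;
	int32_t diff = (int32_t)(greed - tolerance);
	uint32_t penalty = diff > 0 ? (uint32_t)diff : 0;
	uint32_t scaled = penalty * sched_burst_penalty_scale >> 10;
	return scaled < MAX_BURST_PENALTY ? scaled : MAX_BURST_PENALTY;
}

int main(void)
{
	const uint64_t bursts_ns[] = {
		1000000ULL,     /*   1 ms: below the ~2^23 ns threshold -> 0 */
		10000000ULL,    /*  10 ms: just above the threshold          */
		100000000ULL,   /* 100 ms                                    */
		1000000000ULL,  /*   1 s                                     */
	};

	for (unsigned i = 0; i < sizeof(bursts_ns) / sizeof(bursts_ns[0]); i++) {
		uint32_t p = calc_burst_penalty(bursts_ns[i]);
		/* >> 8 converts the 8.8 fixed-point penalty to whole units. */
		printf("burst %10llu ns -> penalty %4u (~%u units)\n",
		       (unsigned long long)bursts_ns[i], p, p >> 8);
	}
	return 0;
}

With the default offset of 24 the penalty stays at zero until a burst exceeds roughly 2^23 ns (about 8.4 ms, since log2p1 is fls()-based and counts from 1), then grows with the logarithm of the burst length, scaled by sched_burst_penalty_scale.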
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d729..39732d2daf80ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2130,7 +2130,11 @@ extern int group_balance_cpu(struct sched_group *sg); extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); +#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); +#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -2800,7 +2804,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); -#ifdef CONFIG_PREEMPT_RT +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ZEN_INTERACTIVE) # define SCHED_NR_MIGRATE_BREAK 8 #else # define SCHED_NR_MIGRATE_BREAK 32 @@ -2809,7 +2813,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern __read_mostly unsigned int sysctl_sched_nr_migrate; extern __read_mostly unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_BORE +extern unsigned int sysctl_sched_min_base_slice; +extern __read_mostly uint sysctl_sched_base_slice; +#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; diff --git a/mm/Kconfig b/mm/Kconfig index ca3f146bc7053a..de643ce40c1108 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -631,7 +631,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || ZEN_INTERACTIVE default 1 # diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6cba1cb14b23ac..43449e5b51aee8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -63,7 +63,11 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -274,7 +276,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_ZEN_INTERACTIVE +static int watermark_boost_factor __read_mostly; +#else static int watermark_boost_factor __read_mostly = 15000; +#endif static int watermark_scale_factor = 10; int defrag_mode; @@ -4640,6 +4646,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool woke_kswapd = false; if (unlikely(nofail)) { /* @@ -4699,8 +4706,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; } - if (alloc_flags & ALLOC_KSWAPD) + if (alloc_flags & ALLOC_KSWAPD) { + if (!woke_kswapd) { + atomic_long_inc(&kswapd_waiters); + woke_kswapd = true; + } wake_all_kswapds(order, gfp_mask, ac); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4915,9 +4927,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto retry; } fail: - warn_alloc(gfp_mask, ac->nodemask, - "page allocation failure: order:%u", order); got_pg: + if (woke_kswapd) + atomic_long_dec(&kswapd_waiters); + if (!page) + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); return page; } diff --git a/mm/swap.c b/mm/swap.c index 2260dcd2775e75..3a7aefc524e56a 
100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1101,6 +1101,10 @@ static const struct ctl_table swap_sysctl_table[] = { */ void __init swap_setup(void) { +#ifdef CONFIG_ZEN_INTERACTIVE + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ @@ -1112,6 +1116,7 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif register_sysctl_init("vm", swap_sysctl_table); } diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3dff..3f0cfcf9cdf410 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6484,7 +6484,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 0; } -static bool allow_direct_reclaim(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -6509,6 +6509,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) wmark_ok = free_pages > pfmemalloc_reserve / 2; + /* The throttled direct reclaimer is now a kswapd waiter */ + if (unlikely(!using_kswapd && !wmark_ok)) + atomic_long_inc(&kswapd_waiters); + /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) @@ -6574,7 +6578,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (allow_direct_reclaim(pgdat)) + if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM)) goto out; break; } @@ -6596,11 +6600,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat), HZ); + allow_direct_reclaim(pgdat, true), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); + allow_direct_reclaim(pgdat, true)); + + if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM))) + atomic_long_dec(&kswapd_waiters); if (fatal_signal_pending(current)) return true; @@ -7130,14 +7137,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - allow_direct_reclaim(pgdat)) + allow_direct_reclaim(pgdat, true)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (was_frozen || ret) + if (was_frozen || ret || !atomic_long_read(&kswapd_waiters)) break; /*