30 commits
e6e8b21
i8042: decrease debug message level to info
fenrus75 Jun 23, 2015
16ba690
increase the ext4 default commit age
fenrus75 Jan 11, 2016
78aae73
silence rapl
fenrus75 Mar 14, 2016
f243bb0
pci pme wakeups
fenrus75 Mar 14, 2016
9071085
ksm-wakeups
fenrus75 May 6, 2019
cd4b867
intel_idle: tweak cpuidle cstates
fenrus75 Mar 20, 2016
d768ec7
smpboot: reuse timer calibration
fenrus75 Feb 11, 2015
ee7c9cb
initialize ata before graphics
fenrus75 Jun 3, 2016
df6ed5e
ipv4/tcp: allow the memory tuning for tcp to go a little bigger than …
fenrus75 Jan 6, 2017
010e3c9
init: wait for partition and retry scan
fenrus75 May 17, 2017
b202143
enable stateless firmware loading
Jun 20, 2018
d7589d0
migrate some systemd defaults to the kernel defaults.
ahkok Aug 2, 2018
fc80743
use lfence instead of rep and nop
fenrus75 Dec 8, 2018
44b41a3
do accept() in LIFO order for cache efficiency
fenrus75 Dec 13, 2018
6a6df01
locking: rwsem: spin faster
fenrus75 Feb 18, 2018
0548879
ata: libahci: ignore staggered spin-up
thac0 Jun 25, 2019
3fafb0f
print CPU that faults
fenrus75 Aug 10, 2019
66dc1ca
x86/microcode: Add an option to reload microcode even if revision is …
Aug 19, 2021
202c459
nvme workaround
fenrus75 Nov 11, 2019
451db4b
don't report an error if PowerClamp run on other CPU
AKoskovich Feb 12, 2020
93c895d
itmt_epb: use epb to scale itmt
fenrus75 Nov 16, 2021
1ab5b88
itmt2 ADL fixes
spandruvada Nov 18, 2021
abf6581
add a per cpu minimum high watermark an tune batch size
fenrus75 Nov 23, 2021
9fa32b9
scale
bwarden Sep 19, 2022
0471ed8
sched/fair: Simplify asym_packing logic for SMT sched groups
ricardon Aug 25, 2022
08777e6
sched/fair: Let lower-priority CPUs do active balancing
ricardon Aug 25, 2022
73171cb
x86/sched: Avoid unnecessary migrations within SMT domains
ricardon Aug 25, 2022
a4b7c0e
powerbump functionality
fenrus75 Jan 4, 2023
66c6adf
add networking support for powerbump
fenrus75 Jan 5, 2023
7ee30f8
futex bump
fenrus75 Jan 12, 2023
1 change: 1 addition & 0 deletions arch/x86/include/asm/topology.h
@@ -175,6 +175,7 @@ extern unsigned int __read_mostly sysctl_sched_itmt_enabled;

/* Interface to set priority of a cpu */
void sched_set_itmt_core_prio(int prio, int core_cpu);
void sched_set_itmt_power_ratio(int power_ratio, int core_cpu);

/* Interface to notify scheduler that system supports ITMT */
int sched_set_itmt_support(void);
2 changes: 1 addition & 1 deletion arch/x86/include/asm/vdso/processor.h
@@ -10,7 +10,7 @@
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
static __always_inline void rep_nop(void)
{
asm volatile("rep; nop" ::: "memory");
asm volatile("lfence" ::: "memory");
}

static __always_inline void cpu_relax(void)
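For context, rep_nop() above is the body of cpu_relax(), which busy-wait loops call between polls of a shared variable. A minimal sketch of that usage pattern, assuming only the two inlines shown in this hunk; the wait_for_flag() helper is hypothetical, not part of the patch:

/* Hypothetical spin-wait showing where cpu_relax() (and thus lfence) runs. */
static void wait_for_flag(volatile int *flag)
{
	while (!*flag)
		cpu_relax();	/* previously expanded to PAUSE ("rep; nop"), now to lfence */
}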
2 changes: 2 additions & 0 deletions arch/x86/kernel/Makefile
@@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o

obj-$(CONFIG_CFI_CLANG) += cfi.o

obj-y += powerbump.o

###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
4 changes: 4 additions & 0 deletions arch/x86/kernel/cpu/intel_epb.c
@@ -166,6 +166,10 @@ static ssize_t energy_perf_bias_store(struct device *dev,
if (ret < 0)
return ret;

/* update the ITMT scheduler logic to use the power policy data */
/* scale the val up by 2 so the range is 224 - 256 */
sched_set_itmt_power_ratio(256 - val * 2, cpu);

return count;
}

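A worked sketch of the EPB-to-ratio mapping used in the hunk above, assuming the usual 0..15 range of IA32_ENERGY_PERF_BIAS; the epb_to_itmt_ratio() helper is illustrative only and mirrors the sched_set_itmt_power_ratio(256 - val * 2, cpu) call:

/* EPB 0 (max performance) -> 256, EPB 15 (max power saving) -> 226 under this formula. */
static int epb_to_itmt_ratio(unsigned int epb_val)
{
	return 256 - epb_val * 2;
}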
40 changes: 39 additions & 1 deletion arch/x86/kernel/cpu/microcode/core.c
@@ -44,6 +44,8 @@

static struct microcode_ops *microcode_ops;
static bool dis_ucode_ldr = true;
bool ucode_rollback = false;
int enable_rollback = 0;

bool initrd_gone;

@@ -80,6 +82,26 @@ static u32 final_levels[] = {
0, /* T-101 terminator */
};

static int __init ucode_setup(char *str)
{
if (!str)
return -EINVAL;

while (*str) {
if (!strncmp(str, "rollback", 8)) {
enable_rollback = 1;
pr_info("Microcode Rollback Enabled\n");
}
str += strcspn(str, ",");
while (*str == ',')
str++;
}
return 0;
}

__setup("ucode=", ucode_setup);


/*
* Check the current patch level on this CPU.
*
@@ -513,6 +535,7 @@ static ssize_t reload_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
enum ucode_state tmp_ret = UCODE_OK;
int bsp = boot_cpu_data.cpu_index;
unsigned long val;
@@ -522,14 +545,28 @@ static ssize_t reload_store(struct device *dev,
if (ret)
return ret;

if (val != 1)
if (!val || val > 2)
return size;

cpus_read_lock();

ret = check_online_cpus();
if (ret)
goto put;
/*
* Check if the vendor is Intel to permit reloading
* microcode even if the revision is unchanged.
* This is typically used during development of microcode
* and changing rev is a pain.
*/
if ((val == 2) && ((c->x86_vendor != X86_VENDOR_INTEL) ||
!enable_rollback))
return size;
else if (val == 2) {
mutex_lock(&microcode_mutex);
ucode_rollback = true;
mutex_unlock(&microcode_mutex);
}

tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
if (tmp_ret != UCODE_NEW)
@@ -540,6 +577,7 @@ static ssize_t reload_store(struct device *dev,
mutex_unlock(&microcode_mutex);

put:
ucode_rollback = false;
cpus_read_unlock();

if (ret == 0)
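The comment block above encodes a small decision table for the value written to the reload sysfs file. A condensed sketch of that policy, assuming "1" keeps the existing behaviour and "2" is the new forced-rollback path gated on X86_VENDOR_INTEL plus the ucode=rollback command-line option; reload_allows_rollback() is an illustrative helper, not a function in the patch:

/* Illustrative restatement of the val handling in reload_store(). */
static bool reload_allows_rollback(unsigned long val,
				   struct cpuinfo_x86 *c,
				   int enable_rollback)
{
	if (val == 1)		/* normal reload: only newer revisions are applied */
		return false;
	if (val != 2)		/* any other value is ignored by reload_store() */
		return false;
	/* val == 2: rollback, Intel only, and only with ucode=rollback on the cmdline */
	return c->x86_vendor == X86_VENDOR_INTEL && enable_rollback;
}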
14 changes: 8 additions & 6 deletions arch/x86/kernel/cpu/microcode/intel.c
@@ -44,6 +44,7 @@ static struct microcode_intel *intel_ucode_patch;

/* last level cache size per core */
static int llc_size_per_core;
extern bool ucode_rollback;

/*
* Returns 1 if update has been found, 0 otherwise.
@@ -80,7 +81,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev
{
struct microcode_header_intel *mc_hdr = mc;

if (mc_hdr->rev <= new_rev)
if (!ucode_rollback && mc_hdr->rev <= new_rev)
return 0;

return find_matching_signature(mc, csig, cpf);
@@ -120,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne
if (find_matching_signature(data, sig, pf)) {
prev_found = true;

if (mc_hdr->rev <= mc_saved_hdr->rev)
if (!ucode_rollback && mc_hdr->rev <= mc_saved_hdr->rev)
continue;

p = memdup_patch(data, size);
@@ -649,7 +650,7 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)

phdr = (struct microcode_header_intel *)iter->data;

if (phdr->rev <= uci->cpu_sig.rev)
if (!ucode_rollback && phdr->rev <= uci->cpu_sig.rev)
continue;

if (!find_matching_signature(phdr,
@@ -734,10 +735,11 @@ static enum ucode_state apply_microcode_intel(int cpu)
* already.
*/
rev = intel_get_microcode_revision();
if (rev >= mc->hdr.rev) {
if (!ucode_rollback && rev >= mc->hdr.rev) {
ret = UCODE_OK;
goto out;
}
} else if (ucode_rollback)
ret = UCODE_OK;

/*
* Writeback and invalidate caches before updating microcode to avoid
@@ -756,7 +758,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
return UCODE_ERROR;
}

if (bsp && rev != prev_rev) {
if (bsp && ((rev != prev_rev) || ucode_rollback)) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
rev,
mc->hdr.date & 0xffff,
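All four hunks in this file relax the same "only accept strictly newer revisions" check when ucode_rollback is set. A condensed sketch of that shared predicate; revision_acceptable() is an illustrative helper, not a function in the patch:

/* Normally only a newer image is considered; rollback accepts any matching image. */
static bool revision_acceptable(unsigned int image_rev, unsigned int current_rev,
				bool rollback)
{
	if (rollback)
		return true;
	return image_rev > current_rev;
}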
46 changes: 30 additions & 16 deletions arch/x86/kernel/itmt.c
@@ -25,6 +25,7 @@

static DEFINE_MUTEX(itmt_update_mutex);
DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
DEFINE_PER_CPU_READ_MOSTLY(int, sched_power_ratio);

/* Boolean to track if system has ITMT capabilities */
static bool __read_mostly sched_itmt_capable;
@@ -169,37 +170,50 @@ void sched_clear_itmt_support(void)

int arch_asym_cpu_priority(int cpu)
{
return per_cpu(sched_core_priority, cpu);
int power_ratio = per_cpu(sched_power_ratio, cpu);

/* a power ratio of 0 (uninitialized) is assumed to be maximum */
if (power_ratio == 0)
power_ratio = 256 - 2 * 6;
return per_cpu(sched_core_priority, cpu) * power_ratio / 256;
}

/**
* sched_set_itmt_core_prio() - Set CPU priority based on ITMT
* @prio: Priority of cpu core
* @core_cpu: The cpu number associated with the core
* @prio: Priority of @cpu
* @cpu: The CPU number
*
* The pstate driver will find out the max boost frequency
* and call this function to set a priority proportional
* to the max boost frequency. CPU with higher boost
* to the max boost frequency. CPUs with higher boost
* frequency will receive higher priority.
*
* No need to rebuild sched domain after updating
* the CPU priorities. The sched domains have no
* dependency on CPU priorities.
*/
void sched_set_itmt_core_prio(int prio, int core_cpu)
void sched_set_itmt_core_prio(int prio, int cpu)
{
per_cpu(sched_core_priority, cpu) = prio * 64 - cpu;
}

/**
* sched_set_itmt_power_ratio() - Set CPU priority based on ITMT
* @power_ratio: The power scaling ratio [1..256] for the core
* @core_cpu: The cpu number associated with the core
*
* Set a scaling to the cpu performance based on long term power
* settings (like EPB).
*
* Note this is for the policy not for the actual dynamic frequency;
* the frequency will increase itself as workloads run on a core.
*/

void sched_set_itmt_power_ratio(int power_ratio, int core_cpu)
{
int cpu, i = 1;
int cpu;

for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
int smt_prio;

/*
* Ensure that the siblings are moved to the end
* of the priority chain and only used when
* all other high priority cpus are out of capacity.
*/
smt_prio = prio * smp_num_siblings / (i * i);
per_cpu(sched_core_priority, cpu) = smt_prio;
i++;
per_cpu(sched_power_ratio, cpu) = power_ratio;
}
}
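Putting the two new per-CPU values together, the effective priority seen by the scheduler is the core priority scaled by the EPB-derived power ratio. A standalone sketch of that combination with a couple of worked numbers; effective_asym_prio() is illustrative, and the constants follow arch_asym_cpu_priority() and sched_set_itmt_core_prio() above:

/* prio * 64 - cpu, scaled by ratio/256; a ratio of 0 is treated as 256 - 2 * 6 = 244. */
static int effective_asym_prio(int prio, int cpu, int power_ratio)
{
	int core_prio = prio * 64 - cpu;

	if (power_ratio == 0)
		power_ratio = 256 - 2 * 6;
	return core_prio * power_ratio / 256;
}
/*
 * Example: a core with prio 40 on cpu 0 gives core_prio 2560;
 * at EPB 0 (ratio 256) the result stays 2560, at EPB 15 (ratio 226) it drops to 2260.
 */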
80 changes: 80 additions & 0 deletions arch/x86/kernel/powerbump.c
@@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2023 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* Kernel power-bump infrastructure
*/
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/jiffies.h>

static DEFINE_PER_CPU(unsigned long, bump_timeout); /* jiffies at which the lease for the bump times out */



/*
* a note about the use of the current cpu versus preemption.
*
* Most uses of in_power_bump() are inside local power management code,
* and are pinned to that cpu already.
*
* On the "set" side, interrupt level code is obviously also fully
* migration-race free.
*
* All other cases are exposed to a migration-race.
*
* The goal of powerbump is statistical rather than deterministic,
* e.g. on average the CPU that hits event X will go towards Y more
* often than not, and the impact of being wrong is a bit of extra
* power potentially for some short durations.
* Weighed against the costs in performance and complexity of dealing
* with the race, the race condition is acceptable.
*
* The second known race is where interrupt context might set a bump
* time in the middle of process context setting a different but smaller bump time,
* with the result that process context will win incorrectly, and the
* actual bump time will be less than expected, but still non-zero.
* Here also the cost of dealing with the race is outweighed by the
* limited impact.
*/


int in_power_bump(void)
{
int cpu = raw_smp_processor_id();
if (time_before(jiffies, per_cpu(bump_timeout, cpu)))
return 1;

/* deal with wrap issues by keeping the stored bump value close to current */
per_cpu(bump_timeout, cpu) = jiffies;
return 0;
}
EXPORT_SYMBOL_GPL(in_power_bump);

void give_power_bump(int msecs)
{
unsigned long nextjiffies;
int cpu;
/* we need to round up an extra jiffy */
nextjiffies = jiffies + msecs_to_jiffies(msecs) + 1;

cpu = raw_smp_processor_id();
if (time_before(per_cpu(bump_timeout, cpu), nextjiffies))
per_cpu(bump_timeout, cpu) = nextjiffies;

}
EXPORT_SYMBOL_GPL(give_power_bump);

static __init int powerbump_init(void)
{
unsigned int cpu;

for_each_possible_cpu(cpu) {
per_cpu(bump_timeout, cpu) = jiffies;
}

return 0;
}

late_initcall(powerbump_init);
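A hedged usage sketch for the two exported helpers: a producer grants a short per-CPU lease after a wakeup-worthy event, and power-management code consults in_power_bump() before choosing a deep idle state. BUMP_FOR_DISK comes from the (not shown) linux/powerbump.h header used by the block patch later in the series; the my_driver_*() names below are hypothetical:

#include <linux/powerbump.h>	/* assumed to declare give_power_bump()/in_power_bump() */

/* Hypothetical completion handler: keep this CPU responsive for a short while. */
static void my_driver_complete_irq(void)
{
	give_power_bump(2);	/* roughly a 2 ms lease on the current CPU */
}

/* Hypothetical check in an idle-state selection path. */
static bool my_driver_prefer_shallow_idle(void)
{
	return in_power_bump();	/* true while the per-CPU lease is active */
}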
3 changes: 3 additions & 0 deletions arch/x86/kernel/tsc.c
@@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void)
if (!constant_tsc || !mask)
return 0;

if (cpu != 0)
return cpu_data(0).loops_per_jiffy;

sibling = cpumask_any_but(mask, cpu);
if (sibling < nr_cpu_ids)
return cpu_data(sibling).loops_per_jiffy;
4 changes: 2 additions & 2 deletions arch/x86/mm/fault.c
@@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;

printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i",
loglvl, tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
(void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id());

print_vma_addr(KERN_CONT " in ", regs->ip);

4 changes: 4 additions & 0 deletions block/bio.c
@@ -19,6 +19,7 @@
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/powerbump.h>

#include <trace/events/block.h>
#include "blk.h"
@@ -1294,6 +1295,7 @@ EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);

static void submit_bio_wait_endio(struct bio *bio)
{
give_power_bump(BUMP_FOR_DISK);
complete(bio->bi_private);
}

@@ -1319,6 +1321,8 @@ int submit_bio_wait(struct bio *bio)
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);

give_power_bump(BUMP_FOR_DISK);

/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)