30 commits
e6e8b21
i8042: decrease debug message level to info
fenrus75 Jun 23, 2015
16ba690
increase the ext4 default commit age
fenrus75 Jan 11, 2016
78aae73
silence rapl
fenrus75 Mar 14, 2016
f243bb0
pci pme wakeups
fenrus75 Mar 14, 2016
9071085
ksm-wakeups
fenrus75 May 6, 2019
cd4b867
intel_idle: tweak cpuidle cstates
fenrus75 Mar 20, 2016
d768ec7
smpboot: reuse timer calibration
fenrus75 Feb 11, 2015
ee7c9cb
initialize ata before graphics
fenrus75 Jun 3, 2016
df6ed5e
ipv4/tcp: allow the memory tuning for tcp to go a little bigger than …
fenrus75 Jan 6, 2017
010e3c9
init: wait for partition and retry scan
fenrus75 May 17, 2017
b202143
enable stateless firmware loading
Jun 20, 2018
d7589d0
migrate some systemd defaults to the kernel defaults.
ahkok Aug 2, 2018
fc80743
use lfence instead of rep and nop
fenrus75 Dec 8, 2018
44b41a3
do accept() in LIFO order for cache efficiency
fenrus75 Dec 13, 2018
6a6df01
locking: rwsem: spin faster
fenrus75 Feb 18, 2018
0548879
ata: libahci: ignore staggered spin-up
thac0 Jun 25, 2019
3fafb0f
print CPU that faults
fenrus75 Aug 10, 2019
66dc1ca
x86/microcode: Add an option to reload microcode even if revision is …
Aug 19, 2021
202c459
nvme workaround
fenrus75 Nov 11, 2019
451db4b
don't report an error if PowerClamp run on other CPU
AKoskovich Feb 12, 2020
93c895d
itmt_epb: use epb to scale itmt
fenrus75 Nov 16, 2021
1ab5b88
itmt2 ADL fixes
spandruvada Nov 18, 2021
abf6581
add a per cpu minimum high watermark an tune batch size
fenrus75 Nov 23, 2021
9fa32b9
scale
bwarden Sep 19, 2022
0471ed8
sched/fair: Simplify asym_packing logic for SMT sched groups
ricardon Aug 25, 2022
08777e6
sched/fair: Let lower-priority CPUs do active balancing
ricardon Aug 25, 2022
73171cb
x86/sched: Avoid unnecessary migrations within SMT domains
ricardon Aug 25, 2022
a4b7c0e
powerbump functionality
fenrus75 Jan 4, 2023
66c6adf
add networking support for powerbump
fenrus75 Jan 5, 2023
7ee30f8
futex bump
fenrus75 Jan 12, 2023
1 change: 1 addition & 0 deletions arch/x86/include/asm/topology.h
@@ -175,6 +175,7 @@ extern unsigned int __read_mostly sysctl_sched_itmt_enabled;

/* Interface to set priority of a cpu */
void sched_set_itmt_core_prio(int prio, int core_cpu);
void sched_set_itmt_power_ratio(int power_ratio, int core_cpu);

/* Interface to notify scheduler that system supports ITMT */
int sched_set_itmt_support(void);
2 changes: 1 addition & 1 deletion arch/x86/include/asm/vdso/processor.h
@@ -10,7 +10,7 @@
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
static __always_inline void rep_nop(void)
{
asm volatile("rep; nop" ::: "memory");
asm volatile("lfence" ::: "memory");
}

static __always_inline void cpu_relax(void)
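For context, rep_nop() above is the body of cpu_relax(), which busy-wait loops call between polls of a shared variable. A minimal sketch of that usage pattern, assuming only the two inlines shown in this hunk; the wait_for_flag() helper is hypothetical, not part of the patch:

/* Hypothetical spin-wait showing where cpu_relax() (and thus lfence) runs. */
static void wait_for_flag(volatile int *flag)
{
	while (!*flag)
		cpu_relax();	/* previously expanded to PAUSE ("rep; nop"), now to lfence */
}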
2 changes: 2 additions & 0 deletions arch/x86/kernel/Makefile
@@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o

obj-$(CONFIG_CFI_CLANG) += cfi.o

obj-y += powerbump.o

###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
4 changes: 4 additions & 0 deletions arch/x86/kernel/cpu/intel_epb.c
@@ -166,6 +166,10 @@ static ssize_t energy_perf_bias_store(struct device *dev,
if (ret < 0)
return ret;

/* update the ITMT scheduler logic to use the power policy data */
/* scale the val up by 2 so the range is 224 - 256 */
sched_set_itmt_power_ratio(256 - val * 2, cpu);

return count;
}

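A worked sketch of the EPB-to-ratio mapping used in the hunk above, assuming the usual 0..15 range of IA32_ENERGY_PERF_BIAS; the epb_to_itmt_ratio() helper is illustrative only and mirrors the sched_set_itmt_power_ratio(256 - val * 2, cpu) call:

/* EPB 0 (max performance) -> 256, EPB 15 (max power saving) -> 226 under this formula. */
static int epb_to_itmt_ratio(unsigned int epb_val)
{
	return 256 - epb_val * 2;
}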
40 changes: 39 additions & 1 deletion arch/x86/kernel/cpu/microcode/core.c
@@ -44,6 +44,8 @@

static struct microcode_ops *microcode_ops;
static bool dis_ucode_ldr = true;
bool ucode_rollback = false;
int enable_rollback = 0;

bool initrd_gone;

@@ -80,6 +82,26 @@ static u32 final_levels[] = {
0, /* T-101 terminator */
};

static int __init ucode_setup(char *str)
{
if (!str)
return -EINVAL;

while (*str) {
if (!strncmp(str, "rollback", 8)) {
enable_rollback = 1;
pr_info("Microcode Rollback Enabled\n");
}
str += strcspn(str, ",");
while (*str == ',')
str++;
}
return 0;
}

__setup("ucode=", ucode_setup);


/*
* Check the current patch level on this CPU.
*
@@ -513,6 +535,7 @@ static ssize_t reload_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
enum ucode_state tmp_ret = UCODE_OK;
int bsp = boot_cpu_data.cpu_index;
unsigned long val;
@@ -522,14 +545,28 @@ static ssize_t reload_store(struct device *dev,
if (ret)
return ret;

if (val != 1)
if (!val || val > 2)
return size;

cpus_read_lock();

ret = check_online_cpus();
if (ret)
goto put;
/*
* Check if the vendor is Intel to permit reloading
* microcode even if the revision is unchanged.
* This is typically used during development of microcode
* and changing rev is a pain.
*/
if ((val == 2) && ((c->x86_vendor != X86_VENDOR_INTEL) ||
!enable_rollback))
return size;
else if (val == 2) {
mutex_lock(&microcode_mutex);
ucode_rollback = true;
mutex_unlock(&microcode_mutex);
}

tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
if (tmp_ret != UCODE_NEW)
@@ -540,6 +577,7 @@ static ssize_t reload_store(struct device *dev,
mutex_unlock(&microcode_mutex);

put:
ucode_rollback = false;
cpus_read_unlock();

if (ret == 0)
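The comment block above encodes a small decision table for the value written to the reload sysfs file. A condensed sketch of that policy, assuming "1" keeps the existing behaviour and "2" is the new forced-rollback path gated on X86_VENDOR_INTEL plus the ucode=rollback command-line option; reload_allows_rollback() is an illustrative helper, not a function in the patch:

/* Illustrative restatement of the val handling in reload_store(). */
static bool reload_allows_rollback(unsigned long val,
				   struct cpuinfo_x86 *c,
				   int enable_rollback)
{
	if (val == 1)		/* normal reload: only newer revisions are applied */
		return false;
	if (val != 2)		/* any other value is ignored by reload_store() */
		return false;
	/* val == 2: rollback, Intel only, and only with ucode=rollback on the cmdline */
	return c->x86_vendor == X86_VENDOR_INTEL && enable_rollback;
}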
14 changes: 8 additions & 6 deletions arch/x86/kernel/cpu/microcode/intel.c
@@ -44,6 +44,7 @@ static struct microcode_intel *intel_ucode_patch;

/* last level cache size per core */
static int llc_size_per_core;
extern bool ucode_rollback;

/*
* Returns 1 if update has been found, 0 otherwise.
@@ -80,7 +81,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev
{
struct microcode_header_intel *mc_hdr = mc;

if (mc_hdr->rev <= new_rev)
if (!ucode_rollback && mc_hdr->rev <= new_rev)
return 0;

return find_matching_signature(mc, csig, cpf);
@@ -120,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne
if (find_matching_signature(data, sig, pf)) {
prev_found = true;

if (mc_hdr->rev <= mc_saved_hdr->rev)
if (!ucode_rollback && mc_hdr->rev <= mc_saved_hdr->rev)
continue;

p = memdup_patch(data, size);
@@ -649,7 +650,7 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)

phdr = (struct microcode_header_intel *)iter->data;

if (phdr->rev <= uci->cpu_sig.rev)
if (!ucode_rollback && phdr->rev <= uci->cpu_sig.rev)
continue;

if (!find_matching_signature(phdr,
@@ -734,10 +735,11 @@ static enum ucode_state apply_microcode_intel(int cpu)
* already.
*/
rev = intel_get_microcode_revision();
if (rev >= mc->hdr.rev) {
if (!ucode_rollback && rev >= mc->hdr.rev) {
ret = UCODE_OK;
goto out;
}
} else if (ucode_rollback)
ret = UCODE_OK;

/*
* Writeback and invalidate caches before updating microcode to avoid
@@ -756,7 +758,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
return UCODE_ERROR;
}

if (bsp && rev != prev_rev) {
if (bsp && ((rev != prev_rev) || ucode_rollback)) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
rev,
mc->hdr.date & 0xffff,
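All four hunks in this file relax the same "only accept strictly newer revisions" check when ucode_rollback is set. A condensed sketch of that shared predicate; revision_acceptable() is an illustrative helper, not a function in the patch:

/* Normally only a newer image is considered; rollback accepts any matching image. */
static bool revision_acceptable(unsigned int image_rev, unsigned int current_rev,
				bool rollback)
{
	if (rollback)
		return true;
	return image_rev > current_rev;
}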
46 changes: 30 additions & 16 deletions arch/x86/kernel/itmt.c
@@ -25,6 +25,7 @@

static DEFINE_MUTEX(itmt_update_mutex);
DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
DEFINE_PER_CPU_READ_MOSTLY(int, sched_power_ratio);

/* Boolean to track if system has ITMT capabilities */
static bool __read_mostly sched_itmt_capable;
@@ -169,37 +170,50 @@ void sched_clear_itmt_support(void)

int arch_asym_cpu_priority(int cpu)
{
return per_cpu(sched_core_priority, cpu);
int power_ratio = per_cpu(sched_power_ratio, cpu);

/* a power ratio of 0 (uninitialized) is assumed to be maximum */
if (power_ratio == 0)
power_ratio = 256 - 2 * 6;
return per_cpu(sched_core_priority, cpu) * power_ratio / 256;
}

/**
* sched_set_itmt_core_prio() - Set CPU priority based on ITMT
* @prio: Priority of cpu core
* @core_cpu: The cpu number associated with the core
* @prio: Priority of @cpu
* @cpu: The CPU number
*
* The pstate driver will find out the max boost frequency
* and call this function to set a priority proportional
* to the max boost frequency. CPU with higher boost
* to the max boost frequency. CPUs with higher boost
* frequency will receive higher priority.
*
* No need to rebuild sched domain after updating
* the CPU priorities. The sched domains have no
* dependency on CPU priorities.
*/
void sched_set_itmt_core_prio(int prio, int core_cpu)
void sched_set_itmt_core_prio(int prio, int cpu)
{
per_cpu(sched_core_priority, cpu) = prio * 64 - cpu;
}

/**
* sched_set_itmt_power_ratio() - Set CPU priority based on ITMT
* @power_ratio: The power scaling ratio [1..256] for the core
* @core_cpu: The cpu number associated with the core
*
* Set a scaling to the cpu performance based on long term power
* settings (like EPB).
*
* Note this is for the policy not for the actual dynamic frequency;
* the frequency will increase itself as workloads run on a core.
*/

void sched_set_itmt_power_ratio(int power_ratio, int core_cpu)
{
int cpu, i = 1;
int cpu;

for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
int smt_prio;

/*
* Ensure that the siblings are moved to the end
* of the priority chain and only used when
* all other high priority cpus are out of capacity.
*/
smt_prio = prio * smp_num_siblings / (i * i);
per_cpu(sched_core_priority, cpu) = smt_prio;
i++;
per_cpu(sched_power_ratio, cpu) = power_ratio;
}
}
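Putting the two new per-CPU values together, the effective priority seen by the scheduler is the core priority scaled by the EPB-derived power ratio. A standalone sketch of that combination with a couple of worked numbers; effective_asym_prio() is illustrative, and the constants follow arch_asym_cpu_priority() and sched_set_itmt_core_prio() above:

/* prio * 64 - cpu, scaled by ratio/256; a ratio of 0 is treated as 256 - 2 * 6 = 244. */
static int effective_asym_prio(int prio, int cpu, int power_ratio)
{
	int core_prio = prio * 64 - cpu;

	if (power_ratio == 0)
		power_ratio = 256 - 2 * 6;
	return core_prio * power_ratio / 256;
}
/*
 * Example: a core with prio 40 on cpu 0 gives core_prio 2560;
 * at EPB 0 (ratio 256) the result stays 2560, at EPB 15 (ratio 226) it drops to 2260.
 */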
80 changes: 80 additions & 0 deletions arch/x86/kernel/powerbump.c
@@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2023 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* Kernel power-bump infrastructure
*/
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/jiffies.h>

static DEFINE_PER_CPU(unsigned long, bump_timeout); /* jiffies at which the lease for the bump times out */



/*
* a note about the use of the current cpu versus preemption.
*
* Most uses of in_power_bump() are inside local power management code,
* and are pinned to that cpu already.
*
* On the "set" side, interrupt level code is obviously also fully
* migration-race free.
*
* All other cases are exposed to a migration-race.
*
* The goal of powerbump is statistical rather than deterministic,
* e.g. on average the CPU that hits event X will go towards Y more
* often than not, and the impact of being wrong is a bit of extra
* power potentially for some short durations.
* Weighed against the costs in performance and complexity of dealing
* with the race, the race condition is acceptable.
*
* The second known race is where interrupt context might set a bump
* time in the middle of process context setting a different but smaller bump time,
* with the result that process context will win incorrectly, and the
* actual bump time will be less than expected, but still non-zero.
* Here also the cost of dealing with the race is outweighed by the
* limited impact.
*/


int in_power_bump(void)
{
int cpu = raw_smp_processor_id();
if (time_before(jiffies, per_cpu(bump_timeout, cpu)))
return 1;

/* deal with wrap issues by keeping the stored bump value close to current */
per_cpu(bump_timeout, cpu) = jiffies;
return 0;
}
EXPORT_SYMBOL_GPL(in_power_bump);

void give_power_bump(int msecs)
{
unsigned long nextjiffies;
int cpu;
/* we need to round up an extra jiffy */
nextjiffies = jiffies + msecs_to_jiffies(msecs) + 1;

cpu = raw_smp_processor_id();
if (time_before(per_cpu(bump_timeout, cpu), nextjiffies))
per_cpu(bump_timeout, cpu) = nextjiffies;

}
EXPORT_SYMBOL_GPL(give_power_bump);

static __init int powerbump_init(void)
{
unsigned int cpu;

for_each_possible_cpu(cpu) {
per_cpu(bump_timeout, cpu) = jiffies;
}

return 0;
}

late_initcall(powerbump_init);
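A hedged usage sketch for the two exported helpers: a producer grants a short per-CPU lease after a wakeup-worthy event, and power-management code consults in_power_bump() before choosing a deep idle state. BUMP_FOR_DISK comes from the (not shown) linux/powerbump.h header used by the block patch later in the series; the my_driver_*() names below are hypothetical:

#include <linux/powerbump.h>	/* assumed to declare give_power_bump()/in_power_bump() */

/* Hypothetical completion handler: keep this CPU responsive for a short while. */
static void my_driver_complete_irq(void)
{
	give_power_bump(2);	/* roughly a 2 ms lease on the current CPU */
}

/* Hypothetical check in an idle-state selection path. */
static bool my_driver_prefer_shallow_idle(void)
{
	return in_power_bump();	/* true while the per-CPU lease is active */
}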
3 changes: 3 additions & 0 deletions arch/x86/kernel/tsc.c
@@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void)
if (!constant_tsc || !mask)
return 0;

if (cpu != 0)
return cpu_data(0).loops_per_jiffy;

sibling = cpumask_any_but(mask, cpu);
if (sibling < nr_cpu_ids)
return cpu_data(sibling).loops_per_jiffy;
4 changes: 2 additions & 2 deletions arch/x86/mm/fault.c
@@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;

printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i",
loglvl, tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
(void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id());

print_vma_addr(KERN_CONT " in ", regs->ip);

4 changes: 4 additions & 0 deletions block/bio.c
@@ -19,6 +19,7 @@
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/powerbump.h>

#include <trace/events/block.h>
#include "blk.h"
@@ -1294,6 +1295,7 @@ EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);

static void submit_bio_wait_endio(struct bio *bio)
{
give_power_bump(BUMP_FOR_DISK);
complete(bio->bi_private);
}

@@ -1319,6 +1321,8 @@ int submit_bio_wait(struct bio *bio)
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);

give_power_bump(BUMP_FOR_DISK);

/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)