2 changes: 2 additions & 0 deletions arch/x86/include/asm/topology.h
@@ -245,4 +245,6 @@ void init_freq_invariance_cppc(void);
#define arch_init_invariance_cppc init_freq_invariance_cppc
#endif

extern int arch_sched_node_distance(int from, int to);

#endif /* _ASM_X86_TOPOLOGY_H */
70 changes: 70 additions & 0 deletions arch/x86/kernel/smpboot.c
@@ -647,6 +647,76 @@ static void __init build_sched_topology(void)
set_sched_topology(x86_topology);
}

#ifdef CONFIG_NUMA
static int sched_avg_remote_distance;
static int avg_remote_numa_distance(void)
{
int i, j;
int distance, nr_remote, total_distance;

if (sched_avg_remote_distance > 0)
return sched_avg_remote_distance;

nr_remote = 0;
total_distance = 0;
for_each_node_state(i, N_CPU) {
for_each_node_state(j, N_CPU) {
distance = node_distance(i, j);

if (distance >= REMOTE_DISTANCE) {
nr_remote++;
total_distance += distance;
}
}
}
if (nr_remote)
sched_avg_remote_distance = total_distance / nr_remote;
else
sched_avg_remote_distance = REMOTE_DISTANCE;

return sched_avg_remote_distance;
}

int arch_sched_node_distance(int from, int to)
{
int d = node_distance(from, to);

switch (boot_cpu_data.x86_vfm) {
case INTEL_GRANITERAPIDS_X:
case INTEL_ATOM_DARKMONT_X:

if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
d < REMOTE_DISTANCE)
return d;

/*
* With SNC enabled, there can be many levels of remote
* NUMA node distance, which creates sched domain levels
* that mix local nodes with only a subset of the remote
* nodes.
*
* Trim the finer distance differences between NUMA nodes
* in the remote package when building sched domains, so
* that all NUMA nodes in the remote package fall into the
* same sched group. This simplifies the NUMA domains and
* avoids extra NUMA levels that span local nodes and part
* of the remote nodes.
*
* GNR and CWF systems are not expected to have more than
* 2 packages or more than 2 hops between packages. A
* single average remote distance would not be appropriate
* with more than 2 packages, since the average distance
* to different remote packages could differ.
*/
WARN_ONCE(topology_max_packages() > 2,
"sched: Expect only up to 2 packages for GNR or CWF, "
"but saw %d packages when building sched domains.",
topology_max_packages());

d = avg_remote_numa_distance();
}
return d;
}
#endif /* CONFIG_NUMA */

void set_cpu_sibling_map(int cpu)
{
bool has_smt = smp_num_siblings > 1;
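To make the trimming concrete, here is a minimal user-space sketch of the same averaging loop, run over a hypothetical 4-node SLIT for a 2-package SNC-2 system (the distance values and the table itself are illustrative assumptions, not from the patch):

```c
/*
 * Sketch of avg_remote_numa_distance() in user space, with a
 * hypothetical 4-node SLIT (2 packages, SNC-2). Values assumed.
 */
#include <stdio.h>

#define REMOTE_DISTANCE	20
#define NR_NODES	4

static const int slit[NR_NODES][NR_NODES] = {
	{ 10, 12, 21, 23 },
	{ 12, 10, 23, 21 },
	{ 21, 23, 10, 12 },
	{ 23, 21, 12, 10 },
};

int main(void)
{
	int nr_remote = 0, total = 0;

	for (int i = 0; i < NR_NODES; i++) {
		for (int j = 0; j < NR_NODES; j++) {
			if (slit[i][j] >= REMOTE_DISTANCE) {
				nr_remote++;
				total += slit[i][j];
			}
		}
	}
	/* 8 remote pairs, total 176 -> average 22 */
	printf("avg remote distance: %d\n", total / nr_remote);
	return 0;
}
```

With these numbers, arch_sched_node_distance() would report every cross-package distance as 22, so the set of unique distances shrinks from {10, 12, 21, 23} to {10, 12, 22} and one NUMA sched-domain level disappears.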
3 changes: 3 additions & 0 deletions include/linux/bitmap.h
@@ -6,6 +6,7 @@

#include <linux/align.h>
#include <linux/bitops.h>
#include <linux/cleanup.h>
#include <linux/find.h>
#include <linux/limits.h>
#include <linux/string.h>
@@ -129,6 +130,8 @@ unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node);
unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node);
void bitmap_free(const unsigned long *bitmap);

DEFINE_FREE(bitmap, unsigned long *, if (_T) bitmap_free(_T))

/* Managed variants of the above. */
unsigned long *devm_bitmap_alloc(struct device *dev,
unsigned int nbits, gfp_t flags);
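The new DEFINE_FREE(bitmap, ...) entry registers a cleanup class so a caller can tag a pointer with __free(bitmap) and have bitmap_free() run automatically on every scope exit, which is what sched_record_numa_dist() below relies on. A minimal sketch of the pattern (the helper name and its arguments are made up for illustration):

```c
/* Hypothetical helper showing the __free(bitmap) cleanup pattern. */
static int count_seen_values(const int *vals, int n, int max)
{
	unsigned long *seen __free(bitmap) = bitmap_zalloc(max, GFP_KERNEL);

	if (!seen)
		return -ENOMEM;

	for (int i = 0; i < n; i++)
		bitmap_set(seen, vals[i], 1);

	/* bitmap_free(seen) runs automatically on return. */
	return bitmap_weight(seen, max);
}
```

The early -ENOMEM return and the normal return both free the bitmap, which is exactly the leak class the patch removes from sched_init_numa()'s error paths.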
108 changes: 86 additions & 22 deletions kernel/sched/topology.c
@@ -1536,11 +1536,18 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;

/*
* sched_domains_numa_distance is derived from sched_numa_node_distance
* and provides a simplified view of NUMA distances used specifically
* for building NUMA scheduling domains.
*/
static int sched_domains_numa_levels;
static int sched_numa_node_levels;
static int sched_domains_curr_level;

int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static int *sched_numa_node_distance;
static struct cpumask ***sched_domains_numa_masks;
#endif

@@ -1755,10 +1762,10 @@ bool find_numa_distance(int distance)
return true;

rcu_read_lock();
distances = rcu_dereference(sched_domains_numa_distance);
distances = rcu_dereference(sched_numa_node_distance);
if (!distances)
goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
for (i = 0; i < sched_numa_node_levels; i++) {
if (distances[i] == distance) {
found = true;
break;
@@ -1834,32 +1841,51 @@ static void init_numa_topology_type(int offline_node)

#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)

void sched_init_numa(int offline_node)
/*
* An architecture can override its NUMA distances to change
* the grouping of NUMA nodes and the number of NUMA levels
* when building NUMA sched domains.
*
* One NUMA level is created for each unique value of
* arch_sched_node_distance().
*/
static int numa_node_dist(int i, int j)
{
struct sched_domain_topology_level *tl;
unsigned long *distance_map;
return node_distance(i, j);
}

int arch_sched_node_distance(int from, int to)
__weak __alias(numa_node_dist);

static bool modified_sched_node_distance(void)
{
return numa_node_dist != arch_sched_node_distance;
}

static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
int **dist, int *levels)
{
unsigned long *distance_map __free(bitmap) = NULL;
int nr_levels = 0;
int i, j;
int *distances;
struct cpumask ***masks;

/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances returned by n_dist().
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
if (!distance_map)
return;
return -ENOMEM;

bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for_each_cpu_node_but(i, offline_node) {
for_each_cpu_node_but(j, offline_node) {
int distance = node_distance(i, j);
int distance = n_dist(i, j);

if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
bitmap_free(distance_map);
return;
return -EINVAL;
}

bitmap_set(distance_map, distance, 1);
@@ -1872,18 +1898,46 @@ void sched_init_numa(int offline_node)
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);

distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
if (!distances) {
bitmap_free(distance_map);
return;
}
if (!distances)
return -ENOMEM;

for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
distances[i] = j;
}
rcu_assign_pointer(sched_domains_numa_distance, distances);
*dist = distances;
*levels = nr_levels;

return 0;
}

void sched_init_numa(int offline_node)
{
struct sched_domain_topology_level *tl;
int nr_levels, nr_node_levels;
int i, j;
int *distances, *domain_distances;
struct cpumask ***masks;

bitmap_free(distance_map);
/* Record the NUMA distances from the SLIT table */
if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
&nr_node_levels))
return;

/* Record modified NUMA distances for building sched domains */
if (modified_sched_node_distance()) {
if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
&domain_distances, &nr_levels)) {
kfree(distances);
return;
}
} else {
domain_distances = distances;
nr_levels = nr_node_levels;
}
rcu_assign_pointer(sched_numa_node_distance, distances);
WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
WRITE_ONCE(sched_numa_node_levels, nr_node_levels);

/*
* 'nr_levels' contains the number of unique distances
@@ -1901,6 +1955,8 @@ void sched_init_numa(int offline_node)
*
* We reset it to 'nr_levels' at the end of this function.
*/
rcu_assign_pointer(sched_domains_numa_distance, domain_distances);

sched_domains_numa_levels = 0;

masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
@@ -1926,10 +1982,13 @@ void sched_init_numa(int offline_node)
masks[i][j] = mask;

for_each_cpu_node_but(k, offline_node) {
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
if (sched_debug() &&
(arch_sched_node_distance(j, k) !=
arch_sched_node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");

if (node_distance(j, k) > sched_domains_numa_distance[i])
if (arch_sched_node_distance(j, k) >
sched_domains_numa_distance[i])
continue;

cpumask_or(mask, mask, cpumask_of_node(k));
@@ -1978,22 +2037,25 @@ void sched_init_numa(int offline_node)
sched_domain_topology = tl;

sched_domains_numa_levels = nr_levels;
WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);

init_numa_topology_type(offline_node);
}


static void sched_reset_numa(void)
{
int nr_levels, *distances;
int nr_levels, *distances, *dom_distances = NULL;
struct cpumask ***masks;

nr_levels = sched_domains_numa_levels;
sched_numa_node_levels = 0;
sched_domains_numa_levels = 0;
sched_max_numa_distance = 0;
sched_numa_topology_type = NUMA_DIRECT;
distances = sched_domains_numa_distance;
distances = sched_numa_node_distance;
if (sched_numa_node_distance != sched_domains_numa_distance)
dom_distances = sched_domains_numa_distance;
rcu_assign_pointer(sched_numa_node_distance, NULL);
rcu_assign_pointer(sched_domains_numa_distance, NULL);
masks = sched_domains_numa_masks;
rcu_assign_pointer(sched_domains_numa_masks, NULL);
@@ -2002,6 +2064,7 @@ static void sched_reset_numa(void)

synchronize_rcu();
kfree(distances);
kfree(dom_distances);
for (i = 0; i < nr_levels && masks; i++) {
if (!masks[i])
continue;
@@ -2048,7 +2111,8 @@ void sched_domains_numa_masks_set(unsigned int cpu)
continue;

/* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
if (arch_sched_node_distance(j, node) <=
sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}
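The default-vs-override detection above works because arch_sched_node_distance is declared as a weak alias of numa_node_dist: unless an architecture (here x86) supplies a strong definition, the two names resolve to the same address, so modified_sched_node_distance() reduces to a pointer comparison. A stand-alone sketch of the same pattern (user space, GCC/Clang; all names hypothetical):

```c
#include <stdio.h>

static int default_dist(int from, int to)
{
	return from == to ? 10 : 20;	/* stand-ins for LOCAL/REMOTE */
}

/* Weak alias: a strong definition elsewhere overrides it. */
int arch_dist(int from, int to)
	__attribute__((weak, alias("default_dist")));

static int dist_is_overridden(void)
{
	return arch_dist != default_dist;
}

int main(void)
{
	/* Prints 0 here; linking a strong arch_dist() would print 1. */
	printf("override in effect: %d\n", dist_is_overridden());
	return 0;
}
```

This is why sched_init_numa() only builds a second distance table when an override is in effect; otherwise domain_distances simply aliases the SLIT-derived table, and sched_reset_numa() frees the second table only when the two pointers differ.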