53 changes: 49 additions & 4 deletions include/linux/mm.h
@@ -2644,7 +2644,19 @@ static inline bool get_user_page_fast_only(unsigned long addr,
*/
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
return percpu_counter_read_positive(&mm->rss_stat[member]);
struct percpu_counter *fbc = &mm->rss_stat[member];

if (percpu_counter_initialized(fbc))
return percpu_counter_read_positive(fbc);

long val = percpu_counter_atomic_read(fbc);
/*
* The counter is updated asynchronously and may temporarily go negative,
* but a negative value is never what callers expect.
*/
if (val < 0)
return 0;
return (unsigned long)val;
}

static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
@@ -2656,7 +2668,12 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member);

static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
percpu_counter_add(&mm->rss_stat[member], value);
struct percpu_counter *fbc = &mm->rss_stat[member];

if (percpu_counter_initialized(fbc))
percpu_counter_add(fbc, value);
else
percpu_counter_atomic_add(fbc, value);

mm_trace_rss_stat(mm, member);
}
@@ -2670,9 +2687,37 @@ static inline void inc_mm_counter(struct mm_struct *mm, int member)

static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
percpu_counter_dec(&mm->rss_stat[member]);
add_mm_counter(mm, member, -1);
}

mm_trace_rss_stat(mm, member);
static inline s64 mm_counter_sum(struct mm_struct *mm, int member)
{
struct percpu_counter *fbc = &mm->rss_stat[member];

if (percpu_counter_initialized(fbc))
return percpu_counter_sum(fbc);

return percpu_counter_atomic_read(fbc);
}

static inline s64 mm_counter_sum_positive(struct mm_struct *mm, int member)
{
struct percpu_counter *fbc = &mm->rss_stat[member];

if (percpu_counter_initialized(fbc))
return percpu_counter_sum_positive(fbc);

return percpu_counter_atomic_read(fbc);
Comment on lines +2706 to +2710
Copilot AI Jan 7, 2026

For atomic mode counters, percpu_counter_atomic_read can return negative values, but mm_counter_sum_positive should return only positive values (as the name implies). The percpu mode uses percpu_counter_sum_positive which clamps to 0, but the atomic mode path doesn't apply this logic. Consider using max(0, percpu_counter_atomic_read(fbc)) to ensure consistency.

Suggested change
-	if (percpu_counter_initialized(fbc))
-		return percpu_counter_sum_positive(fbc);
-	return percpu_counter_atomic_read(fbc);
+	s64 sum;
+	if (percpu_counter_initialized(fbc))
+		return percpu_counter_sum_positive(fbc);
+	sum = percpu_counter_atomic_read(fbc);
+	return sum < 0 ? 0 : sum;

Member Author

> For atomic mode counters, percpu_counter_atomic_read can return negative values, but mm_counter_sum_positive should return only positive values (as the name implies). The percpu mode uses percpu_counter_sum_positive which clamps to 0, but the atomic mode path doesn't apply this logic. Consider using max(0, percpu_counter_atomic_read(fbc)) to ensure consistency.

This function is only used in include/trace/events/kmem.h.

}

static inline int mm_counter_switch_to_pcpu(struct mm_struct *mm)
{
return percpu_counter_switch_to_pcpu_many(mm->rss_stat, NR_MM_COUNTERS);
}

static inline void mm_counter_destroy(struct mm_struct *mm)
{
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
}

/* Optimized variant when folio is already known not to be anon */
48 changes: 45 additions & 3 deletions include/linux/percpu_counter.h
@@ -21,7 +21,18 @@

struct percpu_counter {
raw_spinlock_t lock;
s64 count;
/*
* Depending on whether counters is NULL, the counter supports two modes:
* atomic mode using count_atomic, and percpu mode using count.
* Single-threaded processes should use atomic mode to reduce memory
* consumption and the performance regression caused by percpu counters.
* Multi-threaded processes should use percpu mode to reduce the
* error margin.
*/
union {
s64 count;
atomic64_t count_atomic;
};
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; /* All percpu_counters are on a list */
#endif
@@ -32,14 +43,14 @@ extern int percpu_counter_batch;

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
gfp_t gfp, u32 nr_counters,
struct lock_class_key *key);
struct lock_class_key *key, bool switch_mode);

#define percpu_counter_init_many(fbc, value, gfp, nr_counters) \
({ \
static struct lock_class_key __key; \
\
__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
&__key); \
&__key, false); \
})


@@ -130,6 +141,20 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
return (fbc->counters != NULL);
}

static inline s64 percpu_counter_atomic_read(struct percpu_counter *fbc)
{
return atomic64_read(&fbc->count_atomic);
}

static inline void percpu_counter_atomic_add(struct percpu_counter *fbc,
s64 amount)
{
atomic64_add(amount, &fbc->count_atomic);
}

int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
u32 nr_counters);

#else /* !CONFIG_SMP */

struct percpu_counter {
@@ -260,6 +285,23 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
static inline void percpu_counter_sync(struct percpu_counter *fbc)
{
}

static inline s64 percpu_counter_atomic_read(struct percpu_counter *fbc)
{
return fbc->count;
}

static inline void percpu_counter_atomic_add(struct percpu_counter *fbc,
s64 amount)
{
percpu_counter_add(fbc, amount);
}

static inline int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
u32 nr_counters)
{
return 0;
}
#endif /* CONFIG_SMP */

static inline void percpu_counter_inc(struct percpu_counter *fbc)
4 changes: 2 additions & 2 deletions include/trace/events/kmem.h
@@ -399,8 +399,8 @@ TRACE_EVENT(rss_stat,
__entry->mm_id = mm_ptr_to_hash(mm);
__entry->curr = !!(current->mm == mm);
__entry->member = member;
__entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
<< PAGE_SHIFT);
__entry->size = (mm_counter_sum_positive(mm, member)
<< PAGE_SHIFT);
),

TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
20 changes: 12 additions & 8 deletions kernel/fork.c
@@ -829,7 +829,7 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");

for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
long x = mm_counter_sum(mm, i);

if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
@@ -930,7 +930,7 @@ void __mmdrop(struct mm_struct *mm)
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
mm_counter_destroy(mm);

free_mm(mm);
}
@@ -1308,16 +1308,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (mm_alloc_cid(mm))
goto fail_cid;

if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
NR_MM_COUNTERS))
goto fail_pcpu;

mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;

fail_pcpu:
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
fail_nocontext:
@@ -1741,6 +1735,16 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;

/*
* For single-threaded processes, rss_stat stays in atomic mode, which
* reduces the memory consumption and performance regression caused by
* using percpu counters. For multi-threaded processes, rss_stat is
* switched to percpu mode to reduce the error margin.
*/
if (clone_flags & CLONE_THREAD)
if (mm_counter_switch_to_pcpu(oldmm))
return -ENOMEM;

if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
35 changes: 33 additions & 2 deletions lib/percpu_counter.c
@@ -187,7 +187,7 @@ EXPORT_SYMBOL(__percpu_counter_sum);

Copilot AI Jan 14, 2026

The switch_mode parameter in __percpu_counter_init_many lacks documentation. When switch_mode is true, the function is being used to convert an existing atomic-mode counter to percpu mode, and the count value should not be reset. A comment should be added to document this parameter and its purpose.

Suggested change
/*
 * Initialize an array of percpu counters.
 *
 * @switch_mode: when true, the caller is converting an existing
 * atomic-mode counter to percpu mode. In that case, the existing
 * global count in @fbc[i].count must be preserved and therefore
 * is not reset to @amount.
 */

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
gfp_t gfp, u32 nr_counters,
struct lock_class_key *key)
struct lock_class_key *key, bool switch_mode)
{
unsigned long flags __maybe_unused;
size_t counter_size;
@@ -208,7 +208,8 @@ int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
#ifdef CONFIG_HOTPLUG_CPU
INIT_LIST_HEAD(&fbc[i].list);
#endif
fbc[i].count = amount;
if (likely(!switch_mode))
fbc[i].count = amount;
Copilot AI Jan 7, 2026

When switching from atomic mode to percpu mode, the existing value in count_atomic needs to be migrated to the percpu counter's count field. Currently, when switch_mode is true, fbc[i].count is not initialized at all, which means the accumulated atomic count value is lost. The function should read the current atomic64_read(&fbc[i].count_atomic) value and assign it to fbc[i].count before switching modes to preserve the accumulated RSS statistics.

Suggested change
-			fbc[i].count = amount;
+			fbc[i].count = amount;
+		else
+			fbc[i].count = atomic64_read(&fbc[i].count_atomic);

Member Author

count and count_atomic are members of a union, so they overlap in memory; this step is not needed.
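
A minimal user-space sketch of this union aliasing (not from the patch; simplified stand-in types for the kernel's s64/atomic64_t): a value accumulated through the atomic member is already visible through the plain member, so nothing has to be migrated when the mode switches.

#include <stdatomic.h>
#include <stdio.h>

/* simplified stand-in for the count/count_atomic union in struct percpu_counter */
struct counter {
	union {
		long long count;                 /* read by the percpu-mode path */
		_Atomic long long count_atomic;  /* updated by the atomic-mode path */
	};
};

int main(void)
{
	struct counter c = { .count = 0 };

	/* updates made while the counter is still in atomic mode ... */
	atomic_fetch_add(&c.count_atomic, 42);

	/* ... are visible through the overlapping plain member after the switch */
	printf("%lld\n", c.count);  /* prints 42 */
	return 0;
}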

fbc[i].counters = (void __percpu *)counters + i * counter_size;

debug_percpu_counter_activate(&fbc[i]);
@@ -312,6 +313,36 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
}
EXPORT_SYMBOL(__percpu_counter_compare);

/*
* percpu_counter_switch_to_pcpu_many: Converts struct percpu_counters from
* atomic mode to percpu mode.
*
* Return: 0 if the percpu_counter is already in percpu mode or was
* successfully switched to percpu mode; -ENOMEM if the percpu memory
* allocation fails, in which case the counter remains in atomic mode.
*/
int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
u32 nr_counters)
{
static struct lock_class_key __key;
unsigned long flags;
int ret = 0;

if (percpu_counter_initialized(fbc))
return 0;

preempt_disable();
local_irq_save(flags);
if (likely(!percpu_counter_initialized(fbc)))
ret = __percpu_counter_init_many(fbc, 0,
GFP_ATOMIC|__GFP_NOWARN|__GFP_ZERO,
nr_counters, &__key, true);
Comment on lines +331 to +339
Copilot AI Jan 7, 2026

There is a potential race condition here. The check at line 331 and line 336 both use percpu_counter_initialized(fbc) without proper synchronization between them. If two threads call this function simultaneously, both could see the counter as uninitialized and attempt to initialize it, leading to a memory leak or inconsistent state. Consider using atomic compare-and-swap or ensuring this function is only called under appropriate locks.

Member Author

Note where this is called: the counter only becomes percpu_counter_initialized through the first initialization here, and that first initialization happens in fork(). The process is always single-threaded when it enters this initialization and can only become multi-threaded afterwards, so there are no concurrent callers.

Comment on lines +324 to +339
Copilot AI Jan 14, 2026

This code uses inconsistent indentation with tabs mixed in. The function parameters and code within the function should use consistent indentation matching the rest of the file (tabs for indentation).

local_irq_restore(flags);
preempt_enable();
Comment on lines +334 to +341
Copilot AI Jan 14, 2026

The mode switching in percpu_counter_switch_to_pcpu_many has inadequate synchronization. While it disables preemption and interrupts, there's no memory barrier after the percpu allocation and before other CPUs can observe the new counters pointer. This could lead to other CPUs seeing a non-NULL counters pointer before the percpu memory is fully initialized. An smp_wmb() should be added after successful initialization and smp_rmb() in code paths that check percpu_counter_initialized().

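
A minimal user-space analogue of the publish/observe pairing this comment describes (a sketch, not code from the patch): the switching thread publishes the pointer with a release store only after the data is initialized, and the reader pairs it with an acquire load, playing the roles of the suggested smp_wmb()/smp_rmb().

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct counters { long v[4]; };

static struct counters storage;
static _Atomic(struct counters *) published;  /* plays the role of fbc->counters */

static void *switcher(void *arg)
{
	(void)arg;
	for (int i = 0; i < 4; i++)
		storage.v[i] = 0;  /* initialize the data first */
	/* release store: publish the pointer only after initialization is done */
	atomic_store_explicit(&published, &storage, memory_order_release);
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	struct counters *c;
	/* acquire load pairs with the release store above */
	while (!(c = atomic_load_explicit(&published, memory_order_acquire)))
		;
	printf("%ld\n", c->v[0]);  /* guaranteed to see the initialized value */
	return NULL;
}

int main(void)
{
	pthread_t a, b;
	pthread_create(&a, NULL, reader, NULL);
	pthread_create(&b, NULL, switcher, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Build with -pthread; the acquire/release pair is what prevents the reader from seeing the published pointer before the initialization that precedes it.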
Comment on lines +328 to +341
Copilot AI Jan 14, 2026

Using local_irq_save with GFP_ATOMIC for percpu allocation is problematic. The allocation uses GFP_ATOMIC which can fail more easily than GFP_KERNEL. Since this is called from copy_mm which is in process context and can sleep, a better approach would be to do the allocation without disabling interrupts (using GFP_KERNEL), then use a lock to atomically check if initialization is still needed and set the counters pointer. The current approach makes memory allocation more likely to fail and returns -ENOMEM to the thread creation, which is unnecessarily fragile.

Suggested change
-	unsigned long flags;
-	int ret = 0;
-
-	if (percpu_counter_initialized(fbc))
-		return 0;
-
-	preempt_disable();
-	local_irq_save(flags);
-	if (likely(!percpu_counter_initialized(fbc)))
-		ret = __percpu_counter_init_many(fbc, 0,
-				GFP_ATOMIC|__GFP_NOWARN|__GFP_ZERO,
-				nr_counters, &__key, true);
-	local_irq_restore(flags);
-	preempt_enable();
+	int ret = 0;
+
+	if (percpu_counter_initialized(fbc))
+		return 0;
+
+	if (likely(!percpu_counter_initialized(fbc)))
+		ret = __percpu_counter_init_many(fbc, 0,
+				GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO,
+				nr_counters, &__key, true);


return ret;
}

/*
* Compare counter, and add amount if total is: less than or equal to limit if
* amount is positive, or greater than or equal to limit if amount is negative.